/******************************************************************************
* Copyright (c) Intel Corporation - All rights reserved.                      *
* This file is part of the LIBXSMM library.                                   *
*                                                                             *
* For information on the license, see the LICENSE file.                       *
* Further information: https://github.com/libxsmm/libxsmm/                    *
* SPDX-License-Identifier: BSD-3-Clause                                       *
******************************************************************************/
/* Alexander Heinecke, Greg Henry (Intel Corp.)
******************************************************************************/
#ifndef GENERATOR_COMMON_H
#define GENERATOR_COMMON_H

#include <libxsmm_generator.h>
#include "libxsmm_matrixeqn.h"
#include "libxsmm_main.h"

/* TODO: check if we want to use enums here? Does this have implications in the encoder? */
/* defining register mappings */
/* General-purpose register IDs: RAX, RCX, RDX, RBX, RSP, RBP, RSI, RDI map to 0..7
   and R8..R15 map to 8..15. LIBXSMM_X86_GP_REG_UNDEF (127) is a sentinel
   outside the 0..15 range of real registers. */
#define LIBXSMM_X86_GP_REG_RAX               0
#define LIBXSMM_X86_GP_REG_RCX               1
#define LIBXSMM_X86_GP_REG_RDX               2
#define LIBXSMM_X86_GP_REG_RBX               3
#define LIBXSMM_X86_GP_REG_RSP               4
#define LIBXSMM_X86_GP_REG_RBP               5
#define LIBXSMM_X86_GP_REG_RSI               6
#define LIBXSMM_X86_GP_REG_RDI               7
#define LIBXSMM_X86_GP_REG_R8                8
#define LIBXSMM_X86_GP_REG_R9                9
#define LIBXSMM_X86_GP_REG_R10              10
#define LIBXSMM_X86_GP_REG_R11              11
#define LIBXSMM_X86_GP_REG_R12              12
#define LIBXSMM_X86_GP_REG_R13              13
#define LIBXSMM_X86_GP_REG_R14              14
#define LIBXSMM_X86_GP_REG_R15              15
#define LIBXSMM_X86_GP_REG_UNDEF           127

/* Placeholder register IDs that allow handling AVX and SSE with a single
   encoder function: using these values as the third operand means SSE.
   Vector and mask placeholders share the same value (255). */
#define LIBXSMM_X86_VEC_REG_UNDEF          255
#define LIBXSMM_X86_MASK_REG_UNDEF         255
#define LIBXSMM_X86_AVX512_MASK              1  /* this specifies k1 */

/* special value for undefined immediate */
#define LIBXSMM_X86_IMM_UNDEF             1024

/* special instruction: sentinel ID for an undefined instruction */
#define LIBXSMM_X86_INSTR_UNDEF           9999

/*
 * 4-byte Integer Instruction Header definition map:
 * 4th byte:
 * ---------
 * 31 encoding mode (11=EVEX only, 10=REX, 01=VEX, 00=VEX/EVEX Hybrid)
 * 30 encoding mode (11=EVEX only, 10=REX, 01=VEX, 00=VEX/EVEX Hybrid)
 * 29 #operands (2 bits=0-3)
 * 28 #operands (2 bits=0-3)
 * 27 Reversal load/store ordering. 0=regular, 1=reverse (open question: is one bit enough, or do I need a couple bits to show other orderings)
 * 26 Op code extension in ModRM Regfiles (extension is bits 20-22)
 * 25 gather/scatter instructions with VSIB / enforce SIB addressing (valid only), e.g. AMX, in REX only -> force REX prefix
 * 24 used for pure/base REX/IA32 encodings to signal that the instruction skips the ModRM byte and the opcode byte holds the register; used in EVEX mode as fake W' -> when set to 1 and W (bit 23) is 0 -> 16bit broadcast
 * 3rd byte:
 * ---------
 * 23 W bit (single inputs=0 or double inputs=1)
 * 22 Op code extension shifts in Reg field in ModRM (Shifts like /2, /4, /7, etc.. Maps to values 0-7, corresponding to /0 to /7.)
 * 21 Op code extension shifts in Reg field in ModRM (Shifts like /2, /4, /7, etc.. Maps to values 0-7, corresponding to /0 to /7.)
 * 20 Op code extension shifts in Reg field in ModRM (Shifts like /2, /4, /7, etc.. Maps to values 0-7, corresponding to /0 to /7.)
 * 19 Immediate required by the instruction. 0=no, 1=yes.
 * 18 Reserved, must be 1 for now
 * 17 P (compressed prefix 2-bits and 4 values: None=0x4, 66=0x5, F3=0x6, F2=0x7 , values include the reserved bit)
 * 16 P (compressed prefix 2-bits and 4 values: None=0x4, 66=0x5, F3=0x6, F2=0x7 , values include the reserved bit)
 * 2nd byte:
 * ---------
 * 15 Reserved, must be 0 for now
 * 14 Reserved, must be 0 for now for VEX/EVEX; for pure REX instructions, setting this bit issues an operand-size override prefix (0x66)
 * 13 M (Map bit, 0F=0x1, 0F38=0x2, 0F3A=0x3)
 * 12 M (Map bit, 0F=0x1, 0F38=0x2, 0F3A=0x3)
 * EVEX/VEX:
 *   11 Disp8: N value constant for VL=128/256/512, 1=yes, 0=no (fullmem)
 *   10 Disp8: exp of width (0-6 values mapped to 1 to 64, 7 currently unused)
 *    9 Disp8: exp of width (0-6 values mapped to 1 to 64, 7 currently unused)
 *    8 Disp8: exp of width (0-6 values mapped to 1 to 64, 7 currently unused)
 * VEX-only
 *   11 free
 *   10 free
 *    9 L bit
 *    8 1: override L bit, 0 use L bit from user
 * 1st byte:
 * ---------
 *  7 - Op code byte
 *  6 - Op code byte
 *  5 - Op code byte
 *  4 - Op code byte
 *  3 - Op code byte
 *  2 - Op code byte
 *  1 - Op code byte
 *  0 - Op code byte
 */

/* Load/Store/Move instructions -  AVX1,AVX2,AVX512 - deprecated values */
/* Each value here is identical to the corresponding *_LD definition in the
   load section of this header; the direction-less names are kept as
   deprecated aliases. All values follow the 4-byte instruction header map
   documented above. */
#define LIBXSMM_X86_INSTR_VMOVAPD          0x20851628
#define LIBXSMM_X86_INSTR_VMOVUPD          0x20851610
#define LIBXSMM_X86_INSTR_VMOVAPS          0x20041628
#define LIBXSMM_X86_INSTR_VMOVUPS          0x20041610
#define LIBXSMM_X86_INSTR_VMOVSD           0x20871b10
#define LIBXSMM_X86_INSTR_VMOVSS           0x20061a10
#define LIBXSMM_X86_INSTR_VMASKMOVPD       0x7005202d
#define LIBXSMM_X86_INSTR_VMASKMOVPS       0x7005202c
#define LIBXSMM_X86_INSTR_VMOVDQA32        0xe005166f
#define LIBXSMM_X86_INSTR_VMOVDQA64        0xe085166f
#define LIBXSMM_X86_INSTR_VMOVDQU8         0xe007166f
#define LIBXSMM_X86_INSTR_VMOVDQU16        0xe087166f
#define LIBXSMM_X86_INSTR_VMOVDQU32        0xe006166f
#define LIBXSMM_X86_INSTR_VMOVDQU64        0xe086166f
/* Load instructions -  AVX,AVX2,AVX512 */
/* Decoding hints (see the instruction header map above):
 *  - top nibble 0x2 = VEX/EVEX hybrid, 0x6/0x7 = VEX with 2/3 operands,
 *    0xe = EVEX only with 2 operands;
 *  - scalar and broadcast forms set bit 11 (constant disp8 N) and store the
 *    log2 of the memory width in bits 10-8 (e.g. ..28.. = 1 byte,
 *    ..2a.. = 4 bytes, ..2b.. = 8 bytes, ..2c.. = 16 bytes, ..2d.. = 32 bytes);
 *  - full-vector moves use ..16.. (bit 11 clear, exponent 6 = 64 bytes at VL=512).
 * The *_VEX names carry encoding mode 01 (VEX) for the same mnemonic.
 * The *_GPR names presumably take a general-purpose register as source
 * (inferred from the name only; confirm against the encoder). */
#define LIBXSMM_X86_INSTR_VMOVDDUP         0x20871612
#define LIBXSMM_X86_INSTR_VBROADCASTSD     0x20852b19
#define LIBXSMM_X86_INSTR_VBROADCASTSD_VEX 0x60052019
#define LIBXSMM_X86_INSTR_VBROADCASTSS     0x20052a18
#define LIBXSMM_X86_INSTR_VPBROADCASTB     0x20052878
#define LIBXSMM_X86_INSTR_VPBROADCASTW     0x20052979
#define LIBXSMM_X86_INSTR_VPBROADCASTD     0x20052a58
#define LIBXSMM_X86_INSTR_VPBROADCASTQ     0xe0852b59
#define LIBXSMM_X86_INSTR_VPBROADCASTQ_VEX 0x60052059
#define LIBXSMM_X86_INSTR_VPBROADCASTB_GPR 0xe005287a
#define LIBXSMM_X86_INSTR_VPBROADCASTW_GPR 0xe005297b
#define LIBXSMM_X86_INSTR_VPBROADCASTD_GPR 0xe0052a7c
#define LIBXSMM_X86_INSTR_VPBROADCASTQ_GPR 0xe0852b7c
#define LIBXSMM_X86_INSTR_VMOVAPD_LD       0x20851628
#define LIBXSMM_X86_INSTR_VMOVUPD_LD       0x20851610
#define LIBXSMM_X86_INSTR_VMOVAPS_LD       0x20041628
#define LIBXSMM_X86_INSTR_VMOVUPS_LD       0x20041610
#define LIBXSMM_X86_INSTR_VMOVSD_LD        0x20871b10
#define LIBXSMM_X86_INSTR_VMOVSS_LD        0x20061a10
#define LIBXSMM_X86_INSTR_VMASKMOVPD_LD    0x7005202d
#define LIBXSMM_X86_INSTR_VMASKMOVPS_LD    0x7005202c
#define LIBXSMM_X86_INSTR_VMOVDQA32_LD     0xe005166f
#define LIBXSMM_X86_INSTR_VMOVDQA64_LD     0xe085166f
#define LIBXSMM_X86_INSTR_VMOVDQU8_LD      0xe007166f
#define LIBXSMM_X86_INSTR_VMOVDQU16_LD     0xe087166f
#define LIBXSMM_X86_INSTR_VMOVDQU32_LD     0xe006166f
#define LIBXSMM_X86_INSTR_VMOVDQU64_LD     0xe086166f
#define LIBXSMM_X86_INSTR_VBROADCASTI128   0x6005205a
#define LIBXSMM_X86_INSTR_VBROADCASTI32X2  0xe0052b59
#define LIBXSMM_X86_INSTR_VBROADCASTI32X4  0xe0052c5a
#define LIBXSMM_X86_INSTR_VBROADCASTI64X2  0xe0852c5a
#define LIBXSMM_X86_INSTR_VBROADCASTI32X8  0xe0052d5b
#define LIBXSMM_X86_INSTR_VBROADCASTI64X4  0xe0852d5b
#define LIBXSMM_X86_INSTR_VMOVD_LD         0x20051a6e
#define LIBXSMM_X86_INSTR_VMOVQ_LD         0x20851b6e
/* Store instructions - AVX,AVX2,AVX512 */
/* Every *_ST value differs from its *_LD counterpart only in the opcode byte
   (lowest 8 bits); mode, operand count, prefix, map and disp8 fields are the
   same. The VMOVNT* entries have no *_LD counterpart in this header. */
#define LIBXSMM_X86_INSTR_VMOVNTPD         0x2085162b
#define LIBXSMM_X86_INSTR_VMOVNTPS         0x2004162b
#define LIBXSMM_X86_INSTR_VMOVNTDQ         0x200516e7
#define LIBXSMM_X86_INSTR_VMOVAPD_ST       0x20851629
#define LIBXSMM_X86_INSTR_VMOVUPD_ST       0x20851611
#define LIBXSMM_X86_INSTR_VMOVAPS_ST       0x20041629
#define LIBXSMM_X86_INSTR_VMOVUPS_ST       0x20041611
#define LIBXSMM_X86_INSTR_VMOVSD_ST        0x20871b11
#define LIBXSMM_X86_INSTR_VMOVSS_ST        0x20061a11
#define LIBXSMM_X86_INSTR_VMASKMOVPD_ST    0x7005202f
#define LIBXSMM_X86_INSTR_VMASKMOVPS_ST    0x7005202e
#define LIBXSMM_X86_INSTR_VMOVDQA32_ST     0xe005167f
#define LIBXSMM_X86_INSTR_VMOVDQA64_ST     0xe085167f
#define LIBXSMM_X86_INSTR_VMOVDQU8_ST      0xe007167f
#define LIBXSMM_X86_INSTR_VMOVDQU16_ST     0xe087167f
#define LIBXSMM_X86_INSTR_VMOVDQU32_ST     0xe006167f
#define LIBXSMM_X86_INSTR_VMOVDQU64_ST     0xe086167f
#define LIBXSMM_X86_INSTR_VMOVD_ST         0x20051a7e
#define LIBXSMM_X86_INSTR_VMOVQ_ST         0x20851b7e
/* Gather/Scatter instructions */
/* All entries set bit 25 (VSIB addressing) of the instruction header.
 *  - *_VEX gathers: top byte 0x72 = VEX mode, 3 operands, VSIB.
 *  - EVEX gathers/scatters: top byte 0xe2 = EVEX only, 2 operands, VSIB.
 * Third byte: 0x05 = W0 with 66 prefix (32-bit elements),
 *             0x85 = W1 with 66 prefix (64-bit elements).
 * EVEX forms use a constant disp8 N equal to the element size
 * (..2a.. = 4 bytes, ..2b.. = 8 bytes). */
#define LIBXSMM_X86_INSTR_VGATHERDPS_VEX   0x72052092
#define LIBXSMM_X86_INSTR_VGATHERDPD_VEX   0x72852092
#define LIBXSMM_X86_INSTR_VGATHERQPS_VEX   0x72052093
#define LIBXSMM_X86_INSTR_VGATHERQPD_VEX   0x72852093
#define LIBXSMM_X86_INSTR_VPGATHERDD_VEX   0x72052090
#define LIBXSMM_X86_INSTR_VPGATHERDQ_VEX   0x72852090
#define LIBXSMM_X86_INSTR_VPGATHERQD_VEX   0x72052091
#define LIBXSMM_X86_INSTR_VPGATHERQQ_VEX   0x72852091
#define LIBXSMM_X86_INSTR_VGATHERDPS       0xe2052a92
#define LIBXSMM_X86_INSTR_VGATHERDPD       0xe2852b92
#define LIBXSMM_X86_INSTR_VGATHERQPS       0xe2052a93
#define LIBXSMM_X86_INSTR_VGATHERQPD       0xe2852b93
#define LIBXSMM_X86_INSTR_VPGATHERDD       0xe2052a90
#define LIBXSMM_X86_INSTR_VPGATHERDQ       0xe2852b90
#define LIBXSMM_X86_INSTR_VPGATHERQD       0xe2052a91
#define LIBXSMM_X86_INSTR_VPGATHERQQ       0xe2852b91
#define LIBXSMM_X86_INSTR_VSCATTERDPS      0xe2052aa2
#define LIBXSMM_X86_INSTR_VSCATTERDPD      0xe2852ba2
#define LIBXSMM_X86_INSTR_VSCATTERQPS      0xe2052aa3
#define LIBXSMM_X86_INSTR_VSCATTERQPD      0xe2852ba3
/* VPSCATTERDD scatters 32-bit elements: W must be 0 and no opcode-extension
   bits may be set (third byte 0x05, like VPSCATTERQD / VPGATHERDD). */
#define LIBXSMM_X86_INSTR_VPSCATTERDD      0xe2052aa0
#define LIBXSMM_X86_INSTR_VPSCATTERDQ      0xe2852ba0
#define LIBXSMM_X86_INSTR_VPSCATTERQD      0xe2052aa1
#define LIBXSMM_X86_INSTR_VPSCATTERQQ      0xe2852ba1

/* Shuffle/Permute/Blend instructions */
/* Decoding hints (see the instruction header map above):
 *  - top nibble: 0x2/0x3 = VEX/EVEX hybrid with 2/3 operands,
 *    0x6/0x7 = VEX with 2/3 operands, 0xe/0xf = EVEX only with 2/3 operands;
 *  - 0x08 in the top byte (e.g. 0x28, 0x68, 0xe8) sets the reversed
 *    load/store ordering bit, used by the extract forms;
 *  - 0x0d/0x8d/0x0c in the third byte mark instructions with an immediate;
 *  - bits 11-8 hold the disp8 scaling: x6 = full-vector memory operand,
 *    bit 11 set = constant N of 2^exp bytes. */
/* VEX and EVEX */
#define LIBXSMM_X86_INSTR_VSHUFPS          0x300c16c6
#define LIBXSMM_X86_INSTR_VSHUFPD          0x308d16c6
#define LIBXSMM_X86_INSTR_VPSHUFB          0x30052600
#define LIBXSMM_X86_INSTR_VPSHUFD          0x200d1670
#define LIBXSMM_X86_INSTR_VPSHUFHW         0x200e1670
#define LIBXSMM_X86_INSTR_VPSHUFLW         0x200f1670
#define LIBXSMM_X86_INSTR_VUNPCKLPD        0x30851614
#define LIBXSMM_X86_INSTR_VUNPCKLPS        0x30041614
#define LIBXSMM_X86_INSTR_VUNPCKHPD        0x30851615
#define LIBXSMM_X86_INSTR_VUNPCKHPS        0x30041615
#define LIBXSMM_X86_INSTR_VPUNPCKLBW       0x30051660
#define LIBXSMM_X86_INSTR_VPUNPCKHBW       0x30051668
#define LIBXSMM_X86_INSTR_VPUNPCKLWD       0x30051661
#define LIBXSMM_X86_INSTR_VPUNPCKHWD       0x30051669
#define LIBXSMM_X86_INSTR_VPUNPCKLDQ       0x30051662
#define LIBXSMM_X86_INSTR_VPUNPCKHDQ       0x3005166a
#define LIBXSMM_X86_INSTR_VPUNPCKLQDQ      0x3085166c
#define LIBXSMM_X86_INSTR_VPUNPCKHQDQ      0x3085166d
#define LIBXSMM_X86_INSTR_VPERMD           0x30052636
/* NOTE(review): VPERMQ_I (and VPERMQ below) set bit 11 (constant disp8 N)
   although the memory operand is a full vector; left unchanged here,
   confirm against the encoder whether this is intended. */
#define LIBXSMM_X86_INSTR_VPERMQ_I         0x208d3e00
/* VPERMPS/VPERMILPS/VPERMILPS_I take a full-vector memory operand, so the
   disp8 exponent must be 6 (as for VPERMD/VPERMPD/VPERMILPD); an exponent of
   5 would describe a half-vector operand and yield a wrong compressed
   displacement. */
#define LIBXSMM_X86_INSTR_VPERMPS          0x30052616
#define LIBXSMM_X86_INSTR_VPERMPD_I        0x208d3601
#define LIBXSMM_X86_INSTR_VPERMILPS        0x3005260c
#define LIBXSMM_X86_INSTR_VPERMILPS_I      0x200d3604
#define LIBXSMM_X86_INSTR_VPEXTRB          0x280d3814
#define LIBXSMM_X86_INSTR_VPEXTRD          0x280d3a16
#define LIBXSMM_X86_INSTR_VPEXTRQ          0x288d3b16
#define LIBXSMM_X86_INSTR_VPINSRB          0x300d3820
#define LIBXSMM_X86_INSTR_VPINSRD          0x300d3a22
#define LIBXSMM_X86_INSTR_VPINSRQ          0x308d3b22
/* VEX only */
#define LIBXSMM_X86_INSTR_VPERM2F128       0x700d3006
#define LIBXSMM_X86_INSTR_VPERM2I128       0x700d3046
#define LIBXSMM_X86_INSTR_VEXTRACTF128     0x680d3019
#define LIBXSMM_X86_INSTR_VEXTRACTI128     0x680d3039
#define LIBXSMM_X86_INSTR_VPERMILPD_VEX    0x7005200d
#define LIBXSMM_X86_INSTR_VPERMILPD_VEX_I  0x600d3005
#define LIBXSMM_X86_INSTR_VBLENDPD         0x700d300d
#define LIBXSMM_X86_INSTR_VBLENDPS         0x700d300c
#define LIBXSMM_X86_INSTR_VBLENDVPD        0x700d304b
#define LIBXSMM_X86_INSTR_VBLENDVPS        0x700d304a
#define LIBXSMM_X86_INSTR_VPBLENDD         0x700d3002
#define LIBXSMM_X86_INSTR_VPBLENDW         0x700d300e
#define LIBXSMM_X86_INSTR_VPBLENDVB        0x700d304c
#define LIBXSMM_X86_INSTR_VMOVMSKPD        0x60051050
#define LIBXSMM_X86_INSTR_VMOVMSKPS        0x60041050
#define LIBXSMM_X86_INSTR_VPMOVMSKB        0x600510d7
/* EVEX only */
#define LIBXSMM_X86_INSTR_VSHUFF32X4       0xf00d3623
#define LIBXSMM_X86_INSTR_VSHUFF64X2       0xf08d3623
#define LIBXSMM_X86_INSTR_VSHUFI32X4       0xf00d3643
#define LIBXSMM_X86_INSTR_VSHUFI64X2       0xf08d3643
#define LIBXSMM_X86_INSTR_VEXTRACTF32X4    0xe80d3c19
#define LIBXSMM_X86_INSTR_VEXTRACTF64X2    0xe88d3c19
#define LIBXSMM_X86_INSTR_VEXTRACTF32X8    0xe80d3d1b
#define LIBXSMM_X86_INSTR_VEXTRACTF64X4    0xe88d3d1b
#define LIBXSMM_X86_INSTR_VEXTRACTI32X4    0xe80d3c39
#define LIBXSMM_X86_INSTR_VEXTRACTI64X2    0xe88d3c39
#define LIBXSMM_X86_INSTR_VEXTRACTI32X8    0xe80d3d3b
#define LIBXSMM_X86_INSTR_VEXTRACTI64X4    0xe88d3d3b
#define LIBXSMM_X86_INSTR_VINSERTF32X4     0xf00d3c18
#define LIBXSMM_X86_INSTR_VINSERTF64X2     0xf08d3c18
#define LIBXSMM_X86_INSTR_VINSERTF32X8     0xf00d3d1a
#define LIBXSMM_X86_INSTR_VINSERTF64X4     0xf08d3d1a
#define LIBXSMM_X86_INSTR_VINSERTI32X4     0xf00d3c38
#define LIBXSMM_X86_INSTR_VINSERTI64X2     0xf08d3c38
#define LIBXSMM_X86_INSTR_VINSERTI32X8     0xf00d3d3a
#define LIBXSMM_X86_INSTR_VINSERTI64X4     0xf08d3d3a
#define LIBXSMM_X86_INSTR_VBLENDMPS        0xf0052665
#define LIBXSMM_X86_INSTR_VBLENDMPD        0xf0852665
#define LIBXSMM_X86_INSTR_VPBLENDMB        0xf0052666
#define LIBXSMM_X86_INSTR_VPBLENDMW        0xf0852666
#define LIBXSMM_X86_INSTR_VPBLENDMD        0xf0052664
#define LIBXSMM_X86_INSTR_VPBLENDMQ        0xf0852664
#define LIBXSMM_X86_INSTR_VEXPANDPD        0xe0852b88
#define LIBXSMM_X86_INSTR_VEXPANDPS        0xe0052a88
#define LIBXSMM_X86_INSTR_VPEXPANDQ        0xe0852b89
#define LIBXSMM_X86_INSTR_VPEXPANDD        0xe0052a89
#define LIBXSMM_X86_INSTR_VPEXPANDW        0xe0852962
#define LIBXSMM_X86_INSTR_VPEXPANDB        0xe0052862
#define LIBXSMM_X86_INSTR_VPERMB           0xf005268d
#define LIBXSMM_X86_INSTR_VPERMW           0xf085268d
#define LIBXSMM_X86_INSTR_VPERMQ           0xf0852e36
#define LIBXSMM_X86_INSTR_VPERMPD          0xf0852616
#define LIBXSMM_X86_INSTR_VPERMILPD        0xf085260d
#define LIBXSMM_X86_INSTR_VPERMILPD_I      0xe08d3605
#define LIBXSMM_X86_INSTR_VPERMT2B         0xf005267d
#define LIBXSMM_X86_INSTR_VPERMT2W         0xf085267d
#define LIBXSMM_X86_INSTR_VPERMT2D         0xf005267e
#define LIBXSMM_X86_INSTR_VPERMT2Q         0xf085267e
#define LIBXSMM_X86_INSTR_VPERMT2PS        0xf005267f
#define LIBXSMM_X86_INSTR_VPERMT2PD        0xf085267f
#define LIBXSMM_X86_INSTR_VPERMI2B         0xf0052675
#define LIBXSMM_X86_INSTR_VPERMI2W         0xf0852675
#define LIBXSMM_X86_INSTR_VPERMI2D         0xf0052676
#define LIBXSMM_X86_INSTR_VPERMI2Q         0xf0852676
#define LIBXSMM_X86_INSTR_VPERMI2PS        0xf0052677
#define LIBXSMM_X86_INSTR_VPERMI2PD        0xf0852677

/* FMA instructions */
/* All entries are VEX/EVEX hybrid with 3 operands (top byte 0x30), use the
 * 66 prefix and the 0F38 map. Third byte 0x05 = W0 (PS/SS), 0x85 = W1 (PD/SD).
 * Packed forms use a full-vector disp8 (..26..); scalar forms use a constant
 * disp8 N of 4 bytes (..2a.., SS) or 8 bytes (..2b.., SD).
 * Within each family the opcode byte steps by 0x10 from the 132 to the 213
 * to the 231 operand ordering; the scalar opcode is the packed opcode + 1. */
#define LIBXSMM_X86_INSTR_VFMADD132PS      0x30052698
#define LIBXSMM_X86_INSTR_VFMADD132PD      0x30852698
#define LIBXSMM_X86_INSTR_VFMADD213PS      0x300526a8
#define LIBXSMM_X86_INSTR_VFMADD213PD      0x308526a8
#define LIBXSMM_X86_INSTR_VFMADD231PS      0x300526b8
#define LIBXSMM_X86_INSTR_VFMADD231PD      0x308526b8
#define LIBXSMM_X86_INSTR_VFMSUB132PS      0x3005269a
#define LIBXSMM_X86_INSTR_VFMSUB132PD      0x3085269a
#define LIBXSMM_X86_INSTR_VFMSUB213PS      0x300526aa
#define LIBXSMM_X86_INSTR_VFMSUB213PD      0x308526aa
#define LIBXSMM_X86_INSTR_VFMSUB231PS      0x300526ba
#define LIBXSMM_X86_INSTR_VFMSUB231PD      0x308526ba
#define LIBXSMM_X86_INSTR_VFNMADD132PS     0x3005269c
#define LIBXSMM_X86_INSTR_VFNMADD132PD     0x3085269c
#define LIBXSMM_X86_INSTR_VFNMADD213PS     0x300526ac
#define LIBXSMM_X86_INSTR_VFNMADD213PD     0x308526ac
#define LIBXSMM_X86_INSTR_VFNMADD231PS     0x300526bc
#define LIBXSMM_X86_INSTR_VFNMADD231PD     0x308526bc
#define LIBXSMM_X86_INSTR_VFNMSUB132PS     0x3005269e
#define LIBXSMM_X86_INSTR_VFNMSUB132PD     0x3085269e
#define LIBXSMM_X86_INSTR_VFNMSUB213PS     0x300526ae
#define LIBXSMM_X86_INSTR_VFNMSUB213PD     0x308526ae
#define LIBXSMM_X86_INSTR_VFNMSUB231PS     0x300526be
#define LIBXSMM_X86_INSTR_VFNMSUB231PD     0x308526be
#define LIBXSMM_X86_INSTR_VFMADD132SD      0x30852b99
#define LIBXSMM_X86_INSTR_VFMADD213SD      0x30852ba9
#define LIBXSMM_X86_INSTR_VFMADD231SD      0x30852bb9
#define LIBXSMM_X86_INSTR_VFMADD132SS      0x30052a99
#define LIBXSMM_X86_INSTR_VFMADD213SS      0x30052aa9
#define LIBXSMM_X86_INSTR_VFMADD231SS      0x30052ab9
#define LIBXSMM_X86_INSTR_VFMSUB132SD      0x30852b9b
#define LIBXSMM_X86_INSTR_VFMSUB213SD      0x30852bab
#define LIBXSMM_X86_INSTR_VFMSUB231SD      0x30852bbb
#define LIBXSMM_X86_INSTR_VFMSUB132SS      0x30052a9b
#define LIBXSMM_X86_INSTR_VFMSUB213SS      0x30052aab
#define LIBXSMM_X86_INSTR_VFMSUB231SS      0x30052abb
#define LIBXSMM_X86_INSTR_VFNMADD132SD     0x30852b9d
#define LIBXSMM_X86_INSTR_VFNMADD213SD     0x30852bad
#define LIBXSMM_X86_INSTR_VFNMADD231SD     0x30852bbd
#define LIBXSMM_X86_INSTR_VFNMADD132SS     0x30052a9d
#define LIBXSMM_X86_INSTR_VFNMADD213SS     0x30052aad
#define LIBXSMM_X86_INSTR_VFNMADD231SS     0x30052abd
#define LIBXSMM_X86_INSTR_VFNMSUB132SD     0x30852b9f
#define LIBXSMM_X86_INSTR_VFNMSUB213SD     0x30852baf
#define LIBXSMM_X86_INSTR_VFNMSUB231SD     0x30852bbf
#define LIBXSMM_X86_INSTR_VFNMSUB132SS     0x30052a9f
#define LIBXSMM_X86_INSTR_VFNMSUB213SS     0x30052aaf
#define LIBXSMM_X86_INSTR_VFNMSUB231SS     0x30052abf

/* floating point helpers, VEX */
#define LIBXSMM_X86_INSTR_VROUNDPD         0x600d3009
#define LIBXSMM_X86_INSTR_VROUNDSD         0x700d300b
#define LIBXSMM_X86_INSTR_VROUNDPS         0x600d3008
#define LIBXSMM_X86_INSTR_VROUNDSS         0x700d300a
#define LIBXSMM_X86_INSTR_VRCPPS           0x60041053
#define LIBXSMM_X86_INSTR_VRCPSS           0x70061053
#define LIBXSMM_X86_INSTR_VRSQRTPS         0x60041052
#define LIBXSMM_X86_INSTR_VRSQRTSS         0x70061052

/* floating point helpers, EVEX */
/* EVEX-only entries (top nibble 0xe = 2 operands, 0xf = 3 operands).
 * Third byte 0x0d/0x8d = immediate required (W0/W1), 0x05/0x85 = no immediate.
 * Packed forms use a full-vector disp8 (..x6..); scalar forms use a constant
 * disp8 N of 4 bytes (..xa.., SS) or 8 bytes (..xb.., SD). */
#define LIBXSMM_X86_INSTR_VRANGEPS         0xf00d3650
#define LIBXSMM_X86_INSTR_VRANGEPD         0xf08d3650
#define LIBXSMM_X86_INSTR_VRANGESS         0xf00d3a51
#define LIBXSMM_X86_INSTR_VRANGESD         0xf08d3b51
#define LIBXSMM_X86_INSTR_VREDUCEPS        0xe00d3656
#define LIBXSMM_X86_INSTR_VREDUCEPD        0xe08d3656
#define LIBXSMM_X86_INSTR_VREDUCESS        0xf00d3a57
#define LIBXSMM_X86_INSTR_VREDUCESD        0xf08d3b57
#define LIBXSMM_X86_INSTR_VRCP14PS         0xe005264c
#define LIBXSMM_X86_INSTR_VRCP14PD         0xe085264c
#define LIBXSMM_X86_INSTR_VRCP14SS         0xf0052a4d
#define LIBXSMM_X86_INSTR_VRCP14SD         0xf0852b4d
#define LIBXSMM_X86_INSTR_VRNDSCALEPS      0xe00d3608
#define LIBXSMM_X86_INSTR_VRNDSCALEPD      0xe08d3609
#define LIBXSMM_X86_INSTR_VRNDSCALESS      0xf00d3a0a
#define LIBXSMM_X86_INSTR_VRNDSCALESD      0xf08d3b0b
#define LIBXSMM_X86_INSTR_VRSQRT14PS       0xe005264e
#define LIBXSMM_X86_INSTR_VRSQRT14PD       0xe085264e
#define LIBXSMM_X86_INSTR_VRSQRT14SS       0xf0052a4f
#define LIBXSMM_X86_INSTR_VRSQRT14SD       0xf0852b4f
#define LIBXSMM_X86_INSTR_VSCALEFPS        0xf005262c
#define LIBXSMM_X86_INSTR_VSCALEFPD        0xf085262c
#define LIBXSMM_X86_INSTR_VSCALEFSS        0xf0052a2d
#define LIBXSMM_X86_INSTR_VSCALEFSD        0xf0852b2d

/* compare instructions */
/* Three groups, distinguishable by the top nibble of the value:
 *  - 0x3 (VEX/EVEX hybrid, 3 operands): VCMP* (with immediate) and
 *    VPCMPEQ* / VPCMPGT* (no immediate);
 *  - 0xf (EVEX only, 3 operands): VPCMP[U]{B,W,D,Q}, all with immediate;
 *  - 0x6 (VEX, 2 operands): the VPCMP{E,I}STR{I,M} string compares,
 *    all with immediate. */
#define LIBXSMM_X86_INSTR_VCMPPS           0x300c16c2
#define LIBXSMM_X86_INSTR_VCMPSS           0x300e1ac2
#define LIBXSMM_X86_INSTR_VCMPPD           0x308d16c2
#define LIBXSMM_X86_INSTR_VCMPSD           0x308f1bc2
#define LIBXSMM_X86_INSTR_VPCMPB           0xf00d363f
#define LIBXSMM_X86_INSTR_VPCMPUB          0xf00d363e
#define LIBXSMM_X86_INSTR_VPCMPW           0xf08d363f
#define LIBXSMM_X86_INSTR_VPCMPUW          0xf08d363e
#define LIBXSMM_X86_INSTR_VPCMPD           0xf00d361f
#define LIBXSMM_X86_INSTR_VPCMPUD          0xf00d361e
#define LIBXSMM_X86_INSTR_VPCMPQ           0xf08d361f
#define LIBXSMM_X86_INSTR_VPCMPUQ          0xf08d361e
#define LIBXSMM_X86_INSTR_VPCMPEQB         0x30051674
#define LIBXSMM_X86_INSTR_VPCMPEQW         0x30051675
#define LIBXSMM_X86_INSTR_VPCMPEQD         0x30051676
#define LIBXSMM_X86_INSTR_VPCMPEQQ         0x30852629
#define LIBXSMM_X86_INSTR_VPCMPGTB         0x30051664
#define LIBXSMM_X86_INSTR_VPCMPGTW         0x30051665
#define LIBXSMM_X86_INSTR_VPCMPGTD         0x30051666
#define LIBXSMM_X86_INSTR_VPCMPGTQ         0x30852637
#define LIBXSMM_X86_INSTR_VPCMPESTRI       0x600d3061
#define LIBXSMM_X86_INSTR_VPCMPESTRM       0x600d3060
#define LIBXSMM_X86_INSTR_VPCMPISTRI       0x600d3063
#define LIBXSMM_X86_INSTR_VPCMPISTRM       0x600d3062

/* convert instructions */
/* Width-changing conversions have a memory operand narrower than the vector
 * register, which shows up in the disp8 exponent (bits 10-8, bit 11 clear):
 * x5 = half-vector operand (e.g. VCVTPS2PD, VCVTPH2PS, VPMOV*WD, VPMOV*BW),
 * x4 = quarter-vector operand (e.g. VPMOV*DB, VPMOV*BD), x6 = full vector.
 * NOTE(review): the VPMOV{,S,US}D{B,W} down-converts and VCVTPS2PH set the
 * reversed load/store ordering bit (top byte 0x28), while
 * VPMOVUSWB/VPMOVSWB/VPMOVWB (top byte 0xe0) do not - confirm this
 * difference is intended. */
#define LIBXSMM_X86_INSTR_VCVTPS2PD        0x2004155a
#define LIBXSMM_X86_INSTR_VCVTPH2PS        0x20052513
#define LIBXSMM_X86_INSTR_VCVTPS2PH        0x280d351d
#define LIBXSMM_X86_INSTR_VCVTDQ2PS        0x2004165b
#define LIBXSMM_X86_INSTR_VCVTPS2DQ        0x2005165b
#define LIBXSMM_X86_INSTR_VCVTPS2UDQ       0x20041679
#define LIBXSMM_X86_INSTR_VPMOVDW          0x28062533
#define LIBXSMM_X86_INSTR_VPMOVSXWD        0x20052523
#define LIBXSMM_X86_INSTR_VPMOVDB          0x28062431
#define LIBXSMM_X86_INSTR_VPMOVSDB         0x28062421
#define LIBXSMM_X86_INSTR_VPMOVSDW         0x28062523
#define LIBXSMM_X86_INSTR_VPMOVUSDB        0x28062411
#define LIBXSMM_X86_INSTR_VPMOVUSDW        0x28062513
#define LIBXSMM_X86_INSTR_VPMOVZXWD        0x20052533
#define LIBXSMM_X86_INSTR_VPMOVSXBW        0x20052520
#define LIBXSMM_X86_INSTR_VPMOVZXBW        0x20052530
#define LIBXSMM_X86_INSTR_VPMOVSXBD        0x20052421
#define LIBXSMM_X86_INSTR_VPMOVZXBD        0x20052431
#define LIBXSMM_X86_INSTR_VPMOVUSWB        0xe0062510
#define LIBXSMM_X86_INSTR_VPMOVSWB         0xe0062520
#define LIBXSMM_X86_INSTR_VPMOVWB          0xe0062530
#define LIBXSMM_X86_INSTR_VPACKSSWB        0x30051663
#define LIBXSMM_X86_INSTR_VPACKSSDW        0x3005166b
#define LIBXSMM_X86_INSTR_VPACKUSWB        0x30051667
#define LIBXSMM_X86_INSTR_VPACKUSDW        0x3005262b

/* shift instructions */
/* The *_I (shift-by-immediate) forms set bit 26 (top byte 0x24), meaning the
 * ModRM reg field carries an opcode extension taken from bits 22-20:
 * third byte 0x6d = /6 (VPSLL*), 0x4d = /4 (VPSRA*), 0x2d = /2 (VPSRL*);
 * all of them also set the immediate-required bit.
 * The variable-count forms (VPS??V?) are 3-operand hybrid entries in the
 * 0F38 map without immediate; W selects the W/Q (W1) vs. D (W0) variants. */
#define LIBXSMM_X86_INSTR_VPSLLD_I         0x246d1672
#define LIBXSMM_X86_INSTR_VPSLLW_I         0x246d1671
#define LIBXSMM_X86_INSTR_VPSRAD_I         0x244d1672
#define LIBXSMM_X86_INSTR_VPSRAW_I         0x244d1671
#define LIBXSMM_X86_INSTR_VPSRLD_I         0x242d1672
#define LIBXSMM_X86_INSTR_VPSLLVW          0x30852612
#define LIBXSMM_X86_INSTR_VPSLLVD          0x30052647
#define LIBXSMM_X86_INSTR_VPSLLVQ          0x30852647
#define LIBXSMM_X86_INSTR_VPSRAVW          0x30852611
#define LIBXSMM_X86_INSTR_VPSRAVD          0x30052646
#define LIBXSMM_X86_INSTR_VPSRAVQ          0x30852646
#define LIBXSMM_X86_INSTR_VPSRLVW          0x30852610
#define LIBXSMM_X86_INSTR_VPSRLVD          0x30052645
#define LIBXSMM_X86_INSTR_VPSRLVQ          0x30852645

/* floating point compute */
/* VEX/EVEX hybrid entries in the 0F map (top nibble 0x3 = 3 operands,
 * 0x2 = 2 operands for the packed square roots).
 * Third byte: 0x85 = PD (W1, 66 prefix), 0x87 = SD (W1, F2 prefix),
 *             0x04 = PS (W0, no prefix), 0x06 = SS (W0, F3 prefix).
 * Packed forms take a full-vector memory operand (disp8 field ..16..);
 * scalar forms use a constant disp8 N of 8 bytes (..1b.., SD) or
 * 4 bytes (..1a.., SS). */
#define LIBXSMM_X86_INSTR_VXORPD           0x30851657
#define LIBXSMM_X86_INSTR_VADDPD           0x30851658
#define LIBXSMM_X86_INSTR_VMULPD           0x30851659
#define LIBXSMM_X86_INSTR_VSUBPD           0x3085165c
#define LIBXSMM_X86_INSTR_VDIVPD           0x3085165e
#define LIBXSMM_X86_INSTR_VMINPD           0x3085165d
#define LIBXSMM_X86_INSTR_VMAXPD           0x3085165f
#define LIBXSMM_X86_INSTR_VSQRTPD          0x20851651
#define LIBXSMM_X86_INSTR_VADDSD           0x30871b58
#define LIBXSMM_X86_INSTR_VMULSD           0x30871b59
#define LIBXSMM_X86_INSTR_VSUBSD           0x30871b5c
#define LIBXSMM_X86_INSTR_VDIVSD           0x30871b5e
#define LIBXSMM_X86_INSTR_VMINSD           0x30871b5d
#define LIBXSMM_X86_INSTR_VMAXSD           0x30871b5f
#define LIBXSMM_X86_INSTR_VSQRTSD          0x30871b51

#define LIBXSMM_X86_INSTR_VXORPS           0x30041657
#define LIBXSMM_X86_INSTR_VADDPS           0x30041658
#define LIBXSMM_X86_INSTR_VMULPS           0x30041659
#define LIBXSMM_X86_INSTR_VSUBPS           0x3004165c
#define LIBXSMM_X86_INSTR_VDIVPS           0x3004165e
/* VMINPS and VSQRTPS use the full-vector disp8 exponent (6), matching
   VMAXPS/VMINPD and VSQRTPD; an exponent of 5 would describe a half-vector
   memory operand and yield a wrong compressed displacement. */
#define LIBXSMM_X86_INSTR_VMINPS           0x3004165d
#define LIBXSMM_X86_INSTR_VMAXPS           0x3004165f
#define LIBXSMM_X86_INSTR_VSQRTPS          0x20041651
#define LIBXSMM_X86_INSTR_VMULSS           0x30061a59
#define LIBXSMM_X86_INSTR_VADDSS           0x30061a58
#define LIBXSMM_X86_INSTR_VSUBSS           0x30061a5c
#define LIBXSMM_X86_INSTR_VDIVSS           0x30061a5e
#define LIBXSMM_X86_INSTR_VMINSS           0x30061a5d
#define LIBXSMM_X86_INSTR_VMAXSS           0x30061a5f
#define LIBXSMM_X86_INSTR_VSQRTSS          0x30061a51

/* integer compute */
/* All entries are VEX/EVEX hybrid with 3 operands (top byte 0x30), use the
 * 66 prefix and a full-vector disp8. Third byte 0x05 = W0, 0x85 = W1
 * (the *Q variants). Second byte 0x16 = 0F map, 0x26 = 0F38 map
 * (VPMADDUBSW, VPMAXSD, VPMINSD). */
#define LIBXSMM_X86_INSTR_VPXORD           0x300516ef
#define LIBXSMM_X86_INSTR_VPORD            0x300516eb
#define LIBXSMM_X86_INSTR_VPXORQ           0x308516ef
#define LIBXSMM_X86_INSTR_VPORQ            0x308516eb
#define LIBXSMM_X86_INSTR_VPANDD           0x300516db
#define LIBXSMM_X86_INSTR_VPANDQ           0x308516db
#define LIBXSMM_X86_INSTR_VPADDQ           0x308516d4
#define LIBXSMM_X86_INSTR_VPADDB           0x300516fc
#define LIBXSMM_X86_INSTR_VPADDW           0x300516fd
#define LIBXSMM_X86_INSTR_VPADDD           0x300516fe
#define LIBXSMM_X86_INSTR_VPMADDWD         0x300516f5
#define LIBXSMM_X86_INSTR_VPMADDUBSW       0x30052604
#define LIBXSMM_X86_INSTR_VPADDSW          0x300516ed
#define LIBXSMM_X86_INSTR_VPADDSB          0x300516ec
#define LIBXSMM_X86_INSTR_VPSUBD           0x300516fa
#define LIBXSMM_X86_INSTR_VPSUBW           0x300516f9
#define LIBXSMM_X86_INSTR_VPSUBB           0x300516f8
#define LIBXSMM_X86_INSTR_VPMAXSD          0x3005263d
#define LIBXSMM_X86_INSTR_VPMAXSW          0x300516ee
#define LIBXSMM_X86_INSTR_VPMINSD          0x30052639

/* AVX512 VNNI */
/* Four consecutive opcodes (0x50..0x53) in the 0F38 map with 66 prefix, W0,
   3 operands and full-vector disp8; encoded as VEX/EVEX hybrid (top byte 0x30). */
#define LIBXSMM_X86_INSTR_VPDPBUSD         0x30052650
#define LIBXSMM_X86_INSTR_VPDPBUSDS        0x30052651
#define LIBXSMM_X86_INSTR_VPDPWSSD         0x30052652
#define LIBXSMM_X86_INSTR_VPDPWSSDS        0x30052653

/* AVX512 BF16 */
/* 0F38-map entries with W0 and full-vector disp8. VDPBF16PS and VCVTNEPS2BF16
   use the F3 prefix (third byte 0x06), VCVTNE2PS2BF16 the F2 prefix (0x07).
   VCVTNEPS2BF16 is the only one with 2 operands and hybrid VEX/EVEX mode
   (top byte 0x20); the other two are EVEX only with 3 operands (0xf0). */
#define LIBXSMM_X86_INSTR_VDPBF16PS        0xf0062652
#define LIBXSMM_X86_INSTR_VCVTNEPS2BF16    0x20062672
#define LIBXSMM_X86_INSTR_VCVTNE2PS2BF16   0xf0072672

/* AVX512 FP16 */
/* All entries are EVEX only (top nibble 0xe = 2 operands, 0xf = 3 operands).
 *  - Most packed *PH entries additionally set bit 24 (top byte 0xe1/0xf1),
 *    the "fake W'" bit of the header map: with W = 0 it requests a 16-bit
 *    broadcast. The complex forms (VF[C]MADDC*, VF[C]MULC*) do not set it.
 *  - Scalar *SH entries use a constant disp8 N of 2 bytes (second byte ..x9..);
 *    the complex scalar forms use 4 bytes (..6a..).
 *  - NOTE(review): entries with second byte 0x5x/0x6x set bit 14, which the
 *    header map lists as reserved for VEX/EVEX - presumably it extends the
 *    map field to select opcode maps 5 and 6; confirm against the encoder. */
#define LIBXSMM_X86_INSTR_VADDPH           0xf1045658
#define LIBXSMM_X86_INSTR_VADDSH           0xf0065958
#define LIBXSMM_X86_INSTR_VCMPPH           0xf10c36c2
#define LIBXSMM_X86_INSTR_VCMPSH           0xf00e39c2
#define LIBXSMM_X86_INSTR_VDIVPH           0xf104565e
#define LIBXSMM_X86_INSTR_VDIVSH           0xf006595e
#define LIBXSMM_X86_INSTR_VFCMADDCPH       0xf0076656
#define LIBXSMM_X86_INSTR_VFMADDCPH        0xf0066656
#define LIBXSMM_X86_INSTR_VFCMADDCSH       0xf0076a57
#define LIBXSMM_X86_INSTR_VFMADDCSH        0xf0066a57
#define LIBXSMM_X86_INSTR_VFCMULCPH        0xf00766d6
#define LIBXSMM_X86_INSTR_VFMULCPH         0xf00666d6
#define LIBXSMM_X86_INSTR_VFCMULCSH        0xf0076ad7
#define LIBXSMM_X86_INSTR_VFMULCSH         0xf0066ad7
#define LIBXSMM_X86_INSTR_VFMADDSUB132PH   0xf1056696
#define LIBXSMM_X86_INSTR_VFMADDSUB213PH   0xf10566a6
#define LIBXSMM_X86_INSTR_VFMADDSUB231PH   0xf10566b6
#define LIBXSMM_X86_INSTR_VFMSUBADD132PH   0xf1056697
#define LIBXSMM_X86_INSTR_VFMSUBADD213PH   0xf10566a7
#define LIBXSMM_X86_INSTR_VFMSUBADD231PH   0xf10566b7
#define LIBXSMM_X86_INSTR_VFMADD132PH      0xf1056698
#define LIBXSMM_X86_INSTR_VFMADD213PH      0xf10566a8
#define LIBXSMM_X86_INSTR_VFMADD231PH      0xf10566b8
#define LIBXSMM_X86_INSTR_VFNMADD132PH     0xf105669c
#define LIBXSMM_X86_INSTR_VFNMADD213PH     0xf10566ac
#define LIBXSMM_X86_INSTR_VFNMADD231PH     0xf10566bc
#define LIBXSMM_X86_INSTR_VFMADD132SH      0xf0056999
#define LIBXSMM_X86_INSTR_VFMADD213SH      0xf00569a9
#define LIBXSMM_X86_INSTR_VFMADD231SH      0xf00569b9
#define LIBXSMM_X86_INSTR_VFNMADD132SH     0xf005699d
#define LIBXSMM_X86_INSTR_VFNMADD213SH     0xf00569ad
#define LIBXSMM_X86_INSTR_VFNMADD231SH     0xf00569bd
#define LIBXSMM_X86_INSTR_VFMSUB132PH      0xf105669a
#define LIBXSMM_X86_INSTR_VFMSUB213PH      0xf10566aa
#define LIBXSMM_X86_INSTR_VFMSUB231PH      0xf10566ba
#define LIBXSMM_X86_INSTR_VFNMSUB132PH     0xf105669e
#define LIBXSMM_X86_INSTR_VFNMSUB213PH     0xf10566ae
#define LIBXSMM_X86_INSTR_VFNMSUB231PH     0xf10566be
#define LIBXSMM_X86_INSTR_VFMSUB132SH      0xf005699b
#define LIBXSMM_X86_INSTR_VFMSUB213SH      0xf00569ab
#define LIBXSMM_X86_INSTR_VFMSUB231SH      0xf00569bb
#define LIBXSMM_X86_INSTR_VFNMSUB132SH     0xf005699f
#define LIBXSMM_X86_INSTR_VFNMSUB213SH     0xf00569af
#define LIBXSMM_X86_INSTR_VFNMSUB231SH     0xf00569bf
#define LIBXSMM_X86_INSTR_VPCLASSPH        0xe10c3666
#define LIBXSMM_X86_INSTR_VPCLASSSH        0xe00c3967
#define LIBXSMM_X86_INSTR_VGETEXPPH        0xe1056642
#define LIBXSMM_X86_INSTR_VGETEXPSH        0xf0056943
#define LIBXSMM_X86_INSTR_VGETMANTPH       0xe10c3626
#define LIBXSMM_X86_INSTR_VGETMANTSH       0xf00c3927
#define LIBXSMM_X86_INSTR_VMAXPH           0xf104565f
#define LIBXSMM_X86_INSTR_VMAXSH           0xf006595f
#define LIBXSMM_X86_INSTR_VMINPH           0xf104565d
#define LIBXSMM_X86_INSTR_VMINSH           0xf006595d
#define LIBXSMM_X86_INSTR_VMOVSH_LD_MEM    0xe0065910
#define LIBXSMM_X86_INSTR_VMOVSH_ST_MEM    0xe0065911
#define LIBXSMM_X86_INSTR_VMOVSH_LD_3REG   0xf0065910
#define LIBXSMM_X86_INSTR_VMOVSH_ST_3REG   0xf0065911
#define LIBXSMM_X86_INSTR_VMOVW_LD         0xe005596e
#define LIBXSMM_X86_INSTR_VMOVW_ST         0xe005597e
#define LIBXSMM_X86_INSTR_VMULPH           0xf1045659
#define LIBXSMM_X86_INSTR_VMULSH           0xf0065959
#define LIBXSMM_X86_INSTR_VRCPPH           0xe105664c
#define LIBXSMM_X86_INSTR_VRCPSH           0xf005694d
#define LIBXSMM_X86_INSTR_VREDUCEPH        0xe10c3656
#define LIBXSMM_X86_INSTR_VREDUCESH        0xf00c3957
#define LIBXSMM_X86_INSTR_VRNDSCALEPH      0xe10c3608
#define LIBXSMM_X86_INSTR_VRNDSCALESH      0xf00c390a
#define LIBXSMM_X86_INSTR_VRSQRTPH         0xe105664e
#define LIBXSMM_X86_INSTR_VRSQRTSH         0xf005694f
#define LIBXSMM_X86_INSTR_VSCALEFPH        0xf105662c
#define LIBXSMM_X86_INSTR_VSCALEFSH        0xf005692d
#define LIBXSMM_X86_INSTR_VSQRTPH          0xe1045651
#define LIBXSMM_X86_INSTR_VSQRTSH          0xf0065951
#define LIBXSMM_X86_INSTR_VSUBPH           0xf104565c
#define LIBXSMM_X86_INSTR_VSUBSH           0xf006595c
#define LIBXSMM_X86_INSTR_VCVTW2PH         0xe106567d

/* AVX512 Mask compute instructions */
/* All entries carry encoding-mode bits 10 (listed as "REX" in the header
 * map): top byte 0xa0 = 2 operands, 0xb0 = 3 operands.
 * For the non-shift instructions the operand width is selected by the third
 * byte: 0x05 = B, 0x04 = W, 0x85 = D, 0x84 = Q (W bit plus 66 / no prefix).
 * The KSHIFT* entries require an immediate and use the 0F3A map; there the
 * width is selected by the W bit together with the opcode byte. */
#define LIBXSMM_X86_INSTR_KADDB            0xb005134a
#define LIBXSMM_X86_INSTR_KADDW            0xb004134a
#define LIBXSMM_X86_INSTR_KADDD            0xb085134a
#define LIBXSMM_X86_INSTR_KADDQ            0xb084134a
#define LIBXSMM_X86_INSTR_KANDB            0xb0051341
#define LIBXSMM_X86_INSTR_KANDW            0xb0041341
#define LIBXSMM_X86_INSTR_KANDD            0xb0851341
#define LIBXSMM_X86_INSTR_KANDQ            0xb0841341
#define LIBXSMM_X86_INSTR_KANDNB           0xb0051342
#define LIBXSMM_X86_INSTR_KANDNW           0xb0041342
#define LIBXSMM_X86_INSTR_KANDND           0xb0851342
#define LIBXSMM_X86_INSTR_KANDNQ           0xb0841342
#define LIBXSMM_X86_INSTR_KNOTB            0xa0051144
#define LIBXSMM_X86_INSTR_KNOTW            0xa0041144
#define LIBXSMM_X86_INSTR_KNOTD            0xa0851144
#define LIBXSMM_X86_INSTR_KNOTQ            0xa0841144
#define LIBXSMM_X86_INSTR_KORB             0xb0051345
#define LIBXSMM_X86_INSTR_KORW             0xb0041345
#define LIBXSMM_X86_INSTR_KORD             0xb0851345
#define LIBXSMM_X86_INSTR_KORQ             0xb0841345
#define LIBXSMM_X86_INSTR_KORTESTB         0xa0051198
#define LIBXSMM_X86_INSTR_KORTESTW         0xa0041198
#define LIBXSMM_X86_INSTR_KORTESTD         0xa0851198
#define LIBXSMM_X86_INSTR_KORTESTQ         0xa0841198
#define LIBXSMM_X86_INSTR_KSHIFTLB         0xa00d3132
#define LIBXSMM_X86_INSTR_KSHIFTLW         0xa08d3132
#define LIBXSMM_X86_INSTR_KSHIFTLD         0xa00d3133
#define LIBXSMM_X86_INSTR_KSHIFTLQ         0xa08d3133
#define LIBXSMM_X86_INSTR_KSHIFTRB         0xa00d3130
#define LIBXSMM_X86_INSTR_KSHIFTRW         0xa08d3130
#define LIBXSMM_X86_INSTR_KSHIFTRD         0xa00d3131
#define LIBXSMM_X86_INSTR_KSHIFTRQ         0xa08d3131
#define LIBXSMM_X86_INSTR_KTESTB           0xa0051199
#define LIBXSMM_X86_INSTR_KTESTW           0xa0041199
#define LIBXSMM_X86_INSTR_KTESTD           0xa0851199
#define LIBXSMM_X86_INSTR_KTESTQ           0xa0841199
#define LIBXSMM_X86_INSTR_KUNPCKBW         0xb005134b
#define LIBXSMM_X86_INSTR_KUNPCKWD         0xb004134b
#define LIBXSMM_X86_INSTR_KUNPCKDQ         0xb084134b
#define LIBXSMM_X86_INSTR_KXNORB           0xb0051346
#define LIBXSMM_X86_INSTR_KXNORW           0xb0041346
#define LIBXSMM_X86_INSTR_KXNORD           0xb0851346
#define LIBXSMM_X86_INSTR_KXNORQ           0xb0841346
#define LIBXSMM_X86_INSTR_KXORB            0xb0051347
#define LIBXSMM_X86_INSTR_KXORW            0xb0041347
#define LIBXSMM_X86_INSTR_KXORD            0xb0851347
#define LIBXSMM_X86_INSTR_KXORQ            0xb0841347

/* AVX512 Mask mov instructions
 * Two families, each in B/W/D/Q flavor and in _LD / _ST direction:
 *   KMOV?_GPR_LD / KMOV?_GPR_ST : low byte 0x92 / 0x93
 *   KMOV?_LD     / KMOV?_ST     : low byte 0x90 / 0x91
 * (presumably the _GPR_ forms move between a mask and a general purpose register
 *  and the plain forms are the remaining KMOV forms - confirm against the encoder).
 * NOTE(review): the _GPR_ST ids additionally set 0x08000000 compared to _GPR_LD,
 * whereas the plain _ST ids differ from _LD in the low byte only; confirm that
 * this asymmetry is intended. */
#define LIBXSMM_X86_INSTR_KMOVB_GPR_LD     0xa0051192
#define LIBXSMM_X86_INSTR_KMOVW_GPR_LD     0xa0041192
#define LIBXSMM_X86_INSTR_KMOVD_GPR_LD     0xa0071192
#define LIBXSMM_X86_INSTR_KMOVQ_GPR_LD     0xa0871192
#define LIBXSMM_X86_INSTR_KMOVB_GPR_ST     0xa8051193
#define LIBXSMM_X86_INSTR_KMOVW_GPR_ST     0xa8041193
#define LIBXSMM_X86_INSTR_KMOVD_GPR_ST     0xa8071193
#define LIBXSMM_X86_INSTR_KMOVQ_GPR_ST     0xa8871193
#define LIBXSMM_X86_INSTR_KMOVB_LD         0xa0051190
#define LIBXSMM_X86_INSTR_KMOVW_LD         0xa0041190
#define LIBXSMM_X86_INSTR_KMOVD_LD         0xa0851190
#define LIBXSMM_X86_INSTR_KMOVQ_LD         0xa0841190
#define LIBXSMM_X86_INSTR_KMOVB_ST         0xa0051191
#define LIBXSMM_X86_INSTR_KMOVW_ST         0xa0041191
#define LIBXSMM_X86_INSTR_KMOVD_ST         0xa0851191
#define LIBXSMM_X86_INSTR_KMOVQ_ST         0xa0841191

/* AVX2 low precision convert instructions, AVX-NE-CONVERT
 * All ids of this group start with 0x60; the VBCSTNE* ids use low byte 0xb1,
 * the VCVTNE* ids use low byte 0xb0 and are told apart by the third byte only. */
#define LIBXSMM_X86_INSTR_VBCSTNEBF162PS   0x600620b1
#define LIBXSMM_X86_INSTR_VBCSTNESH2PS     0x600520b1
#define LIBXSMM_X86_INSTR_VCVTNEEBF162PS   0x600620b0
#define LIBXSMM_X86_INSTR_VCVTNEEPH2PS     0x600520b0
#define LIBXSMM_X86_INSTR_VCVTNEOBF162PS   0x600720b0
#define LIBXSMM_X86_INSTR_VCVTNEOPH2PS     0x600420b0
/* #define LIBXSMM_X86_INSTR_VCVTNEPS2BF16 is not needed as the encoding overlaps with EVEX */

/* AVX2 Int8 VNNI with all sign combinations, AVX-VNNI-INT8
 * All ids of this group start with 0x70. The S-suffixed name of each pair uses
 * low byte 0x51, the unsuffixed one 0x50; the SU/SS/UU flavor is distinguished
 * in the third byte (0x06, 0x07, 0x04). */
#define LIBXSMM_X86_INSTR_VPDPBSUD         0x70062050
#define LIBXSMM_X86_INSTR_VPDPBSUDS        0x70062051
#define LIBXSMM_X86_INSTR_VPDPBSSD         0x70072050
#define LIBXSMM_X86_INSTR_VPDPBSSDS        0x70072051
#define LIBXSMM_X86_INSTR_VPDPBUUD         0x70042050
#define LIBXSMM_X86_INSTR_VPDPBUUDS        0x70042051

/* SSE1 instructions
 * - MOVAPS, MOVUPS and MOVSS without suffix have the same value as their _LD name;
 *   the matching _ST id sets 0x08000000 and uses the next low byte.
 * - MOVLHPS has the same value as MOVHPS, and MOVHLPS the same value as MOVLPS
 *   (presumably register vs. memory form of one opcode - confirm against the encoder).
 * - Where a ...PS name has a ...SS sibling (MUL, ADD, SUB, DIV, RCP, SQRT, MAX, MIN,
 *   RSQRT, CMP, MOV), the SS id equals the PS id with 0x00020000 set.
 *   COMISS/UCOMISS carry no such bit. */
#define LIBXSMM_X86_INSTR_MOVAPS           0xa0041028
#define LIBXSMM_X86_INSTR_MOVAPS_LD        0xa0041028
#define LIBXSMM_X86_INSTR_MOVAPS_ST        0xa8041029
#define LIBXSMM_X86_INSTR_MOVUPS           0xa0041010
#define LIBXSMM_X86_INSTR_MOVUPS_LD        0xa0041010
#define LIBXSMM_X86_INSTR_MOVUPS_ST        0xa8041011
#define LIBXSMM_X86_INSTR_MOVLPS           0xa0041012
#define LIBXSMM_X86_INSTR_MOVHPS           0xa0041016
#define LIBXSMM_X86_INSTR_MOVLHPS          0xa0041016
#define LIBXSMM_X86_INSTR_MOVHLPS          0xa0041012
#define LIBXSMM_X86_INSTR_MOVMSKPS         0xa0041050
#define LIBXSMM_X86_INSTR_MOVNTPS          0xa004102b
#define LIBXSMM_X86_INSTR_ANDPS            0xa0041054
#define LIBXSMM_X86_INSTR_ANDNPS           0xa0041055
#define LIBXSMM_X86_INSTR_ORPS             0xa0041056
#define LIBXSMM_X86_INSTR_XORPS            0xa0041057
#define LIBXSMM_X86_INSTR_MULPS            0xa0041059
#define LIBXSMM_X86_INSTR_ADDPS            0xa0041058
#define LIBXSMM_X86_INSTR_SUBPS            0xa004105c
#define LIBXSMM_X86_INSTR_DIVPS            0xa004105e
#define LIBXSMM_X86_INSTR_RCPPS            0xa0041053
#define LIBXSMM_X86_INSTR_SQRTPS           0xa0041051
#define LIBXSMM_X86_INSTR_MAXPS            0xa004105f
#define LIBXSMM_X86_INSTR_MINPS            0xa004105d
#define LIBXSMM_X86_INSTR_RSQRTPS          0xa0041052
#define LIBXSMM_X86_INSTR_CMPPS            0xa00c10c2
#define LIBXSMM_X86_INSTR_SHUFPS           0xa00c10c6
#define LIBXSMM_X86_INSTR_UNPCKHPS         0xa0041015
#define LIBXSMM_X86_INSTR_UNPCKLPS         0xa0041014
#define LIBXSMM_X86_INSTR_MOVSS            0xa0061010
#define LIBXSMM_X86_INSTR_MOVSS_LD         0xa0061010
#define LIBXSMM_X86_INSTR_MOVSS_ST         0xa8061011
#define LIBXSMM_X86_INSTR_MULSS            0xa0061059
#define LIBXSMM_X86_INSTR_ADDSS            0xa0061058
#define LIBXSMM_X86_INSTR_SUBSS            0xa006105c
#define LIBXSMM_X86_INSTR_DIVSS            0xa006105e
#define LIBXSMM_X86_INSTR_RCPSS            0xa0061053
#define LIBXSMM_X86_INSTR_SQRTSS           0xa0061051
#define LIBXSMM_X86_INSTR_MAXSS            0xa006105f
#define LIBXSMM_X86_INSTR_MINSS            0xa006105d
#define LIBXSMM_X86_INSTR_RSQRTSS          0xa0061052
#define LIBXSMM_X86_INSTR_CMPSS            0xa00e10c2
#define LIBXSMM_X86_INSTR_COMISS           0xa004102f
#define LIBXSMM_X86_INSTR_UCOMISS          0xa004102e

/* SSE2 instructions
 * Floating point part; the list mirrors the SSE1 names with PD/SD in place of PS/SS.
 * - MOVAPD, MOVUPD and MOVSD without suffix have the same value as their _LD name.
 * - MOVLHPD has the same value as MOVHPD, and MOVHLPD the same value as MOVLPD.
 * - Where a ...PD name has a ...SD sibling, the SD id equals the PD id with 0x00020000 set.
 * - MOVQ_SSE_* equals MOVD_SSE_* with 0x02800000 set.
 * NOTE(review): MOVD_SSE_ST / MOVQ_SSE_ST do not set 0x08000000, unlike the other
 * _ST ids of this group (MOVAPD_ST, MOVUPD_ST, MOVSD_ST) - confirm this is intended.
 * NOTE(review): RCPPD, RSQRTPD, RCPSD, RSQRTSD, MOVLHPD and MOVHLPD do not seem to
 * correspond to mnemonics in the Intel instruction set reference; they look like
 * mechanical mirrors of the SSE1 names - confirm before relying on them. */
#define LIBXSMM_X86_INSTR_MOVD_SSE_LD      0xa005106e
#define LIBXSMM_X86_INSTR_MOVD_SSE_ST      0xa005107e
#define LIBXSMM_X86_INSTR_MOVQ_SSE_LD      0xa285106e
#define LIBXSMM_X86_INSTR_MOVQ_SSE_ST      0xa285107e
#define LIBXSMM_X86_INSTR_MOVAPD           0xa0051028
#define LIBXSMM_X86_INSTR_MOVAPD_LD        0xa0051028
#define LIBXSMM_X86_INSTR_MOVAPD_ST        0xa8051029
#define LIBXSMM_X86_INSTR_MOVUPD           0xa0051010
#define LIBXSMM_X86_INSTR_MOVUPD_LD        0xa0051010
#define LIBXSMM_X86_INSTR_MOVUPD_ST        0xa8051011
#define LIBXSMM_X86_INSTR_MOVLPD           0xa0051012
#define LIBXSMM_X86_INSTR_MOVHPD           0xa0051016
#define LIBXSMM_X86_INSTR_MOVLHPD          0xa0051016
#define LIBXSMM_X86_INSTR_MOVHLPD          0xa0051012
#define LIBXSMM_X86_INSTR_MOVMSKPD         0xa0051050
#define LIBXSMM_X86_INSTR_MOVNTPD          0xa005102b
#define LIBXSMM_X86_INSTR_ANDPD            0xa0051054
#define LIBXSMM_X86_INSTR_ANDNPD           0xa0051055
#define LIBXSMM_X86_INSTR_ORPD             0xa0051056
#define LIBXSMM_X86_INSTR_XORPD            0xa0051057
#define LIBXSMM_X86_INSTR_MULPD            0xa0051059
#define LIBXSMM_X86_INSTR_ADDPD            0xa0051058
#define LIBXSMM_X86_INSTR_SUBPD            0xa005105c
#define LIBXSMM_X86_INSTR_DIVPD            0xa005105e
#define LIBXSMM_X86_INSTR_RCPPD            0xa0051053
#define LIBXSMM_X86_INSTR_SQRTPD           0xa0051051
#define LIBXSMM_X86_INSTR_MAXPD            0xa005105f
#define LIBXSMM_X86_INSTR_MINPD            0xa005105d
#define LIBXSMM_X86_INSTR_RSQRTPD          0xa0051052
#define LIBXSMM_X86_INSTR_CMPPD            0xa00d10c2
#define LIBXSMM_X86_INSTR_SHUFPD           0xa00d10c6
#define LIBXSMM_X86_INSTR_UNPCKHPD         0xa0051015
#define LIBXSMM_X86_INSTR_UNPCKLPD         0xa0051014
#define LIBXSMM_X86_INSTR_MOVSD            0xa0071010
#define LIBXSMM_X86_INSTR_MOVSD_LD         0xa0071010
#define LIBXSMM_X86_INSTR_MOVSD_ST         0xa8071011
#define LIBXSMM_X86_INSTR_MULSD            0xa0071059
#define LIBXSMM_X86_INSTR_ADDSD            0xa0071058
#define LIBXSMM_X86_INSTR_SUBSD            0xa007105c
#define LIBXSMM_X86_INSTR_DIVSD            0xa007105e
#define LIBXSMM_X86_INSTR_RCPSD            0xa0071053
#define LIBXSMM_X86_INSTR_SQRTSD           0xa0071051
#define LIBXSMM_X86_INSTR_MAXSD            0xa007105f
#define LIBXSMM_X86_INSTR_MINSD            0xa007105d
#define LIBXSMM_X86_INSTR_RSQRTSD          0xa0071052
#define LIBXSMM_X86_INSTR_CMPSD            0xa00f10c2
#define LIBXSMM_X86_INSTR_COMISD           0xa005102f
#define LIBXSMM_X86_INSTR_UCOMISD          0xa005102e
/* SSE2 integer part: moves, logic, pack/unpack, arithmetic, compares, shuffles, shifts, converts.
 * - MOVDQA / MOVDQU only exist as explicit _LD / _ST ids; _ST sets 0x08000000.
 * - Shift names with _I suffix (presumably the shift-by-immediate forms) start with
 *   0x94 instead of 0xa0 and differ among each other in bits 20..22 of the id;
 *   the names without _I use the regular 0xa005.... pattern. */
#define LIBXSMM_X86_INSTR_MOVDQA_LD        0xa005106f
#define LIBXSMM_X86_INSTR_MOVDQA_ST        0xa805107f
#define LIBXSMM_X86_INSTR_MOVDQU_LD        0xa006106f
#define LIBXSMM_X86_INSTR_MOVDQU_ST        0xa806107f
#define LIBXSMM_X86_INSTR_MOVNTDQ          0xa00510e7
#define LIBXSMM_X86_INSTR_PAND             0xa00510db
#define LIBXSMM_X86_INSTR_PANDN            0xa00510df
#define LIBXSMM_X86_INSTR_POR              0xa00510eb
#define LIBXSMM_X86_INSTR_PXOR             0xa00510ef
#define LIBXSMM_X86_INSTR_PACKSSWB         0xa0051063
#define LIBXSMM_X86_INSTR_PACKSSDW         0xa005106b
#define LIBXSMM_X86_INSTR_PACKUSWB         0xa0051067
#define LIBXSMM_X86_INSTR_PADDB            0xa00510fc
#define LIBXSMM_X86_INSTR_PADDW            0xa00510fd
#define LIBXSMM_X86_INSTR_PADDD            0xa00510fe
#define LIBXSMM_X86_INSTR_PADDQ            0xa00510d4
#define LIBXSMM_X86_INSTR_PADDSB           0xa00510ec
#define LIBXSMM_X86_INSTR_PADDSW           0xa00510ed
#define LIBXSMM_X86_INSTR_PADDUSB          0xa00510dc
#define LIBXSMM_X86_INSTR_PADDUSW          0xa00510dd
#define LIBXSMM_X86_INSTR_PAVGB            0xa00510e0
#define LIBXSMM_X86_INSTR_PAVGW            0xa00510e3
#define LIBXSMM_X86_INSTR_PCMPEQB          0xa0051074
#define LIBXSMM_X86_INSTR_PCMPEQW          0xa0051075
#define LIBXSMM_X86_INSTR_PCMPEQD          0xa0051076
#define LIBXSMM_X86_INSTR_PCMPGTB          0xa0051064
#define LIBXSMM_X86_INSTR_PCMPGTW          0xa0051065
#define LIBXSMM_X86_INSTR_PCMPGTD          0xa0051066
#define LIBXSMM_X86_INSTR_PEXTRW           0xa00d10c5
#define LIBXSMM_X86_INSTR_PINSRW           0xa00d10c4
#define LIBXSMM_X86_INSTR_PMADDWD          0xa00510f5
#define LIBXSMM_X86_INSTR_PMAXSW           0xa00510ee
#define LIBXSMM_X86_INSTR_PMAXUB           0xa00510de
#define LIBXSMM_X86_INSTR_PMINSW           0xa00510ea
#define LIBXSMM_X86_INSTR_PMINUB           0xa00510da
#define LIBXSMM_X86_INSTR_PMOVMSKB         0xa00510d7
#define LIBXSMM_X86_INSTR_PMULHUW          0xa00510e4
#define LIBXSMM_X86_INSTR_PMULHW           0xa00510e5
#define LIBXSMM_X86_INSTR_PMULLW           0xa00510d5
#define LIBXSMM_X86_INSTR_PMULUDQ          0xa00510f4
#define LIBXSMM_X86_INSTR_PSADBW           0xa00510f6
/* the three PSHUF* ids share low byte 0x70 and differ in the third byte only */
#define LIBXSMM_X86_INSTR_PSHUFD           0xa00d1070
#define LIBXSMM_X86_INSTR_PSHUFHW          0xa00e1070
#define LIBXSMM_X86_INSTR_PSHUFLW          0xa00f1070
#define LIBXSMM_X86_INSTR_PSLLW            0xa00510f1
#define LIBXSMM_X86_INSTR_PSLLW_I          0x946d1071
#define LIBXSMM_X86_INSTR_PSLLD            0xa00510f2
#define LIBXSMM_X86_INSTR_PSLLD_I          0x946d1072
#define LIBXSMM_X86_INSTR_PSLLQ            0xa00510f3
#define LIBXSMM_X86_INSTR_PSLLQ_I          0x946d1073
#define LIBXSMM_X86_INSTR_PSLLDQ_I         0x947d1073
#define LIBXSMM_X86_INSTR_PSRAW            0xa00510e1
#define LIBXSMM_X86_INSTR_PSRAW_I          0x944d1071
#define LIBXSMM_X86_INSTR_PSRAD            0xa00510e2
#define LIBXSMM_X86_INSTR_PSRAD_I          0x944d1072
#define LIBXSMM_X86_INSTR_PSRLW            0xa00510d1
#define LIBXSMM_X86_INSTR_PSRLW_I          0x942d1071
#define LIBXSMM_X86_INSTR_PSRLD            0xa00510d2
#define LIBXSMM_X86_INSTR_PSRLD_I          0x942d1072
#define LIBXSMM_X86_INSTR_PSRLQ            0xa00510d3
#define LIBXSMM_X86_INSTR_PSRLQ_I          0x942d1073
#define LIBXSMM_X86_INSTR_PSRLDQ_I         0x943d1073
#define LIBXSMM_X86_INSTR_PSUBB            0xa00510f8
#define LIBXSMM_X86_INSTR_PSUBW            0xa00510f9
#define LIBXSMM_X86_INSTR_PSUBD            0xa00510fa
#define LIBXSMM_X86_INSTR_PSUBQ            0xa00510fb
#define LIBXSMM_X86_INSTR_PSUBSB           0xa00510e8
#define LIBXSMM_X86_INSTR_PSUBSW           0xa00510e9
#define LIBXSMM_X86_INSTR_PSUBUSB          0xa00510d8
#define LIBXSMM_X86_INSTR_PSUBUSW          0xa00510d9
#define LIBXSMM_X86_INSTR_PUNPCKHBW        0xa0051068
#define LIBXSMM_X86_INSTR_PUNPCKHWD        0xa0051069
#define LIBXSMM_X86_INSTR_PUNPCKHDQ        0xa005106a
#define LIBXSMM_X86_INSTR_PUNPCKHQDQ       0xa005106d
#define LIBXSMM_X86_INSTR_PUNPCKLBW        0xa0051060
#define LIBXSMM_X86_INSTR_PUNPCKLWD        0xa0051061
#define LIBXSMM_X86_INSTR_PUNPCKLDQ        0xa0051062
#define LIBXSMM_X86_INSTR_PUNPCKLQDQ       0xa005106c
/* converts: several ids share a low byte (0xe6, 0x5a, 0x5b) and are told apart by the third byte */
#define LIBXSMM_X86_INSTR_CVTDQ2PD         0xa00610e6
#define LIBXSMM_X86_INSTR_CVTDQ2PS         0xa004105b
#define LIBXSMM_X86_INSTR_CVTPD2DQ         0xa00710e6
#define LIBXSMM_X86_INSTR_CVTPD2PS         0xa005105a
#define LIBXSMM_X86_INSTR_CVTPS2DQ         0xa005105b
#define LIBXSMM_X86_INSTR_CVTPS2PD         0xa004105a
#define LIBXSMM_X86_INSTR_CVTSD2SS         0xa007105a
#define LIBXSMM_X86_INSTR_CVTSS2SD         0xa006105a
#define LIBXSMM_X86_INSTR_CVTTPD2DQ        0xa00510e6
#define LIBXSMM_X86_INSTR_CVTTPS2DQ        0xa006105b

/* SSE3 instructions
 * The PD/PS pairs (ADDSUB, HADD, HSUB) share the low byte and use third byte
 * 0x05 (PD) vs. 0x07 (PS). MOVDDUP / MOVSLDUP share low byte 0x12 and
 * MOVSHDUP uses 0x16, again distinguished by the third byte. */
#define LIBXSMM_X86_INSTR_ADDSUBPD         0xa00510d0
#define LIBXSMM_X86_INSTR_ADDSUBPS         0xa00710d0
#define LIBXSMM_X86_INSTR_HADDPD           0xa005107c
#define LIBXSMM_X86_INSTR_HADDPS           0xa007107c
#define LIBXSMM_X86_INSTR_HSUBPD           0xa005107d
#define LIBXSMM_X86_INSTR_HSUBPS           0xa007107d
#define LIBXSMM_X86_INSTR_LDDQU            0xa00710f0
#define LIBXSMM_X86_INSTR_MOVDDUP          0xa0071012
#define LIBXSMM_X86_INSTR_MOVSHDUP         0xa0061016
#define LIBXSMM_X86_INSTR_MOVSLDUP         0xa0061012

/* SSSE3 instructions
 * All ids use the 0xa00520xx pattern, except PALIGNR which uses 0xa00d30xx. */
#define LIBXSMM_X86_INSTR_PABSB            0xa005201c
#define LIBXSMM_X86_INSTR_PABSW            0xa005201d
#define LIBXSMM_X86_INSTR_PABSD            0xa005201e
#define LIBXSMM_X86_INSTR_PALIGNR          0xa00d300f
#define LIBXSMM_X86_INSTR_PHADDW           0xa0052001
#define LIBXSMM_X86_INSTR_PHADDD           0xa0052002
#define LIBXSMM_X86_INSTR_PHADDSW          0xa0052003
#define LIBXSMM_X86_INSTR_PHSUBW           0xa0052005
#define LIBXSMM_X86_INSTR_PHSUBD           0xa0052006
#define LIBXSMM_X86_INSTR_PHSUBSW          0xa0052007
#define LIBXSMM_X86_INSTR_PMADDUBSW        0xa0052004
#define LIBXSMM_X86_INSTR_PMULHRSW         0xa005200b
#define LIBXSMM_X86_INSTR_PSHUFB           0xa0052000
#define LIBXSMM_X86_INSTR_PSIGNB           0xa0052008
#define LIBXSMM_X86_INSTR_PSIGNW           0xa0052009
#define LIBXSMM_X86_INSTR_PSIGND           0xa005200a

/* SSE4.1 instructions
 * Two id patterns occur: 0xa00520xx and 0xa00d30xx (the latter e.g. for
 * BLENDPD/PS, DPPD/PS, ROUND*, PBLENDW, MPSADBW, PEXTR*, PINSR*).
 * - PEXTRB/PEXTRD/PEXTRQ set 0x08000000, PINSRB/PINSRD/PINSRQ do not.
 * - PEXTRQ / PINSRQ equal PEXTRD / PINSRD with 0x02800000 set. */
#define LIBXSMM_X86_INSTR_BLENDPD          0xa00d300d
#define LIBXSMM_X86_INSTR_BLENDPS          0xa00d300c
#define LIBXSMM_X86_INSTR_BLENDVPD         0xa0052015
#define LIBXSMM_X86_INSTR_BLENDVPS         0xa0052014
#define LIBXSMM_X86_INSTR_DPPD             0xa00d3041
#define LIBXSMM_X86_INSTR_DPPS             0xa00d3040
#define LIBXSMM_X86_INSTR_EXTRACTPS        0xa00d3017
#define LIBXSMM_X86_INSTR_INSERTPS         0xa00d3021
#define LIBXSMM_X86_INSTR_ROUNDPD          0xa00d3009
#define LIBXSMM_X86_INSTR_ROUNDPS          0xa00d3008
#define LIBXSMM_X86_INSTR_ROUNDSD          0xa00d300b
#define LIBXSMM_X86_INSTR_ROUNDSS          0xa00d300a
#define LIBXSMM_X86_INSTR_MOVNTDQA         0xa005202a
#define LIBXSMM_X86_INSTR_PBLENDW          0xa00d300e
#define LIBXSMM_X86_INSTR_PBLENDVB         0xa0052010
#define LIBXSMM_X86_INSTR_PCMPEQQ          0xa0052029
#define LIBXSMM_X86_INSTR_PMOVSXBW         0xa0052020
#define LIBXSMM_X86_INSTR_PMOVSXBD         0xa0052021
#define LIBXSMM_X86_INSTR_PMOVSXBQ         0xa0052022
#define LIBXSMM_X86_INSTR_PMOVSXWD         0xa0052023
#define LIBXSMM_X86_INSTR_PMOVSXWQ         0xa0052024
#define LIBXSMM_X86_INSTR_PMOVSXDQ         0xa0052025
#define LIBXSMM_X86_INSTR_PMOVZXBW         0xa0052030
#define LIBXSMM_X86_INSTR_PMOVZXBD         0xa0052031
#define LIBXSMM_X86_INSTR_PMOVZXBQ         0xa0052032
#define LIBXSMM_X86_INSTR_PMOVZXWD         0xa0052033
#define LIBXSMM_X86_INSTR_PMOVZXWQ         0xa0052034
#define LIBXSMM_X86_INSTR_PMOVZXDQ         0xa0052035
#define LIBXSMM_X86_INSTR_PEXTRB           0xa80d3014
#define LIBXSMM_X86_INSTR_PEXTRD           0xa80d3016
#define LIBXSMM_X86_INSTR_PEXTRQ           0xaa8d3016
#define LIBXSMM_X86_INSTR_PHMINPOSUW       0xa0052041
#define LIBXSMM_X86_INSTR_PINSRB           0xa00d3020
#define LIBXSMM_X86_INSTR_PINSRD           0xa00d3022
#define LIBXSMM_X86_INSTR_PINSRQ           0xa28d3022
#define LIBXSMM_X86_INSTR_PMAXSB           0xa005203c
#define LIBXSMM_X86_INSTR_PMAXSD           0xa005203d
#define LIBXSMM_X86_INSTR_PMAXUW           0xa005203e
#define LIBXSMM_X86_INSTR_PMAXUD           0xa005203f
#define LIBXSMM_X86_INSTR_PMINSB           0xa0052038
#define LIBXSMM_X86_INSTR_PMINSD           0xa0052039
#define LIBXSMM_X86_INSTR_PMINUW           0xa005203a
#define LIBXSMM_X86_INSTR_PMINUD           0xa005203b
#define LIBXSMM_X86_INSTR_MPSADBW          0xa00d3042
#define LIBXSMM_X86_INSTR_PMULDQ           0xa0052028
#define LIBXSMM_X86_INSTR_PMULLD           0xa0052040
#define LIBXSMM_X86_INSTR_PACKUSDW         0xa005202b
#define LIBXSMM_X86_INSTR_PTEST            0xa0052017

/* SSE4.2 instructions
 * Only PCMPGTQ is listed here; it uses the same 0xa00520xx pattern as PCMPEQQ in the SSE4.1 group. */
#define LIBXSMM_X86_INSTR_PCMPGTQ          0xa0052037

/* IA GP instructions
 * Two kinds of ids are mixed in this group:
 * - small decimal ids (e.g. ADDQ 30000, ANDQ 30001), and
 * - packed 32-bit hex ids with B/W/D/Q width suffix and an operand-form suffix:
 *     _RM_IMM8/16/32 : ids starting with 0x94 (Q: 0x96)
 *     _RM_R          : ids starting with 0xa8 (Q: 0xaa)
 *     _R_RM          : ids starting with 0xa0 (Q: 0xa2)
 *   For one operation the _RM_R id equals the _R_RM id with 0x08000000 set and
 *   the low byte reduced by 2 (e.g. ADDD_RM_R 0xa8040001 vs. ADDD_R_RM 0xa0040003).
 *   A Q id equals the D id with 0x02800000 set; a W id sets 0x00004000 (and
 *   0x00000100 instead of 0x00000200 for the immediate forms).
 * (presumably RM = register-or-memory operand, R = register, IMM = immediate - confirm
 *  against the encoder) */
#define LIBXSMM_X86_INSTR_ADDQ             30000
#define LIBXSMM_X86_INSTR_ADDB_RM_IMM8     0x940c0080
#define LIBXSMM_X86_INSTR_ADDW_RM_IMM16    0x940c4181
#define LIBXSMM_X86_INSTR_ADDD_RM_IMM32    0x940c0281
#define LIBXSMM_X86_INSTR_ADDQ_RM_IMM32    0x968c0281
#define LIBXSMM_X86_INSTR_ADDB_RM_R        0xa8040000
#define LIBXSMM_X86_INSTR_ADDW_RM_R        0xa8044001
#define LIBXSMM_X86_INSTR_ADDD_RM_R        0xa8040001
#define LIBXSMM_X86_INSTR_ADDQ_RM_R        0xaa840001
#define LIBXSMM_X86_INSTR_ADDB_R_RM        0xa0040002
#define LIBXSMM_X86_INSTR_ADDW_R_RM        0xa0044003
#define LIBXSMM_X86_INSTR_ADDD_R_RM        0xa0040003
#define LIBXSMM_X86_INSTR_ADDQ_R_RM        0xa2840003
#define LIBXSMM_X86_INSTR_ANDQ             30001
#define LIBXSMM_X86_INSTR_ANDB_RM_IMM8     0x944c0080
#define LIBXSMM_X86_INSTR_ANDW_RM_IMM16    0x944c4181
#define LIBXSMM_X86_INSTR_ANDD_RM_IMM32    0x944c0281
#define LIBXSMM_X86_INSTR_ANDQ_RM_IMM32    0x96cc0281
#define LIBXSMM_X86_INSTR_ANDB_RM_R        0xa8040020
#define LIBXSMM_X86_INSTR_ANDW_RM_R        0xa8044021
#define LIBXSMM_X86_INSTR_ANDD_RM_R        0xa8040021
#define LIBXSMM_X86_INSTR_ANDQ_RM_R        0xaa840021
#define LIBXSMM_X86_INSTR_ANDB_R_RM        0xa0040022
#define LIBXSMM_X86_INSTR_ANDW_R_RM        0xa0044023
#define LIBXSMM_X86_INSTR_ANDD_R_RM        0xa0040023
#define LIBXSMM_X86_INSTR_ANDQ_R_RM        0xa2840023
/* CLC and CMC start with 0x80; the cache-line ids (CLDEMOTE, CLFLUSH, CLFLUSHOPT) start with 0x94 */
#define LIBXSMM_X86_INSTR_CLC              0x800400f8
#define LIBXSMM_X86_INSTR_CLDEMOTE         0x9404101c
#define LIBXSMM_X86_INSTR_CLFLUSH          0x947410ae
#define LIBXSMM_X86_INSTR_CLFLUSHOPT       0x947510ae
#define LIBXSMM_X86_INSTR_CMC              0x800400f5
/* Conditional moves (CMOVcc).
 * Every condition is available as W, D and Q id plus an unsuffixed name that
 * resolves to the Q id.
 * Only 16 conditions carry literal values (A, AE, B, BE, E, G, GE, L, LE, NE,
 * NO, NP, NS, O, P, S; low bytes 0x40..0x4f). All other condition names are
 * aliases of one of those:
 *   C, NAE -> B      NB, NC -> AE     NA  -> BE     NBE -> A
 *   NG     -> LE     NGE    -> L      NL  -> GE     NLE -> G
 *   NZ     -> NE     Z      -> E      PE  -> P      PO  -> NP
 * A Q id equals the D id with 0x02800000 set; a W id equals the D id with 0x00004000 set. */
#define LIBXSMM_X86_INSTR_CMOVAW           0xa0045047
#define LIBXSMM_X86_INSTR_CMOVAD           0xa0041047
#define LIBXSMM_X86_INSTR_CMOVAQ           0xa2841047
#define LIBXSMM_X86_INSTR_CMOVA            LIBXSMM_X86_INSTR_CMOVAQ
#define LIBXSMM_X86_INSTR_CMOVAEW          0xa0045043
#define LIBXSMM_X86_INSTR_CMOVAED          0xa0041043
#define LIBXSMM_X86_INSTR_CMOVAEQ          0xa2841043
#define LIBXSMM_X86_INSTR_CMOVAE           LIBXSMM_X86_INSTR_CMOVAEQ
#define LIBXSMM_X86_INSTR_CMOVBW           0xa0045042
#define LIBXSMM_X86_INSTR_CMOVBD           0xa0041042
#define LIBXSMM_X86_INSTR_CMOVBQ           0xa2841042
#define LIBXSMM_X86_INSTR_CMOVB            LIBXSMM_X86_INSTR_CMOVBQ
#define LIBXSMM_X86_INSTR_CMOVBEW          0xa0045046
#define LIBXSMM_X86_INSTR_CMOVBED          0xa0041046
#define LIBXSMM_X86_INSTR_CMOVBEQ          0xa2841046
#define LIBXSMM_X86_INSTR_CMOVBE           LIBXSMM_X86_INSTR_CMOVBEQ
#define LIBXSMM_X86_INSTR_CMOVCW           LIBXSMM_X86_INSTR_CMOVBW
#define LIBXSMM_X86_INSTR_CMOVCD           LIBXSMM_X86_INSTR_CMOVBD
#define LIBXSMM_X86_INSTR_CMOVCQ           LIBXSMM_X86_INSTR_CMOVBQ
#define LIBXSMM_X86_INSTR_CMOVC            LIBXSMM_X86_INSTR_CMOVCQ
#define LIBXSMM_X86_INSTR_CMOVEW           0xa0045044
#define LIBXSMM_X86_INSTR_CMOVED           0xa0041044
#define LIBXSMM_X86_INSTR_CMOVEQ           0xa2841044
#define LIBXSMM_X86_INSTR_CMOVE            LIBXSMM_X86_INSTR_CMOVEQ
#define LIBXSMM_X86_INSTR_CMOVGW           0xa004504f
#define LIBXSMM_X86_INSTR_CMOVGD           0xa004104f
#define LIBXSMM_X86_INSTR_CMOVGQ           0xa284104f
#define LIBXSMM_X86_INSTR_CMOVG            LIBXSMM_X86_INSTR_CMOVGQ
#define LIBXSMM_X86_INSTR_CMOVGEW          0xa004504d
#define LIBXSMM_X86_INSTR_CMOVGED          0xa004104d
#define LIBXSMM_X86_INSTR_CMOVGEQ          0xa284104d
#define LIBXSMM_X86_INSTR_CMOVGE           LIBXSMM_X86_INSTR_CMOVGEQ
#define LIBXSMM_X86_INSTR_CMOVLW           0xa004504c
#define LIBXSMM_X86_INSTR_CMOVLD           0xa004104c
#define LIBXSMM_X86_INSTR_CMOVLQ           0xa284104c
#define LIBXSMM_X86_INSTR_CMOVL            LIBXSMM_X86_INSTR_CMOVLQ
#define LIBXSMM_X86_INSTR_CMOVLEW          0xa004504e
#define LIBXSMM_X86_INSTR_CMOVLED          0xa004104e
#define LIBXSMM_X86_INSTR_CMOVLEQ          0xa284104e
#define LIBXSMM_X86_INSTR_CMOVLE           LIBXSMM_X86_INSTR_CMOVLEQ
#define LIBXSMM_X86_INSTR_CMOVNAW          LIBXSMM_X86_INSTR_CMOVBEW
#define LIBXSMM_X86_INSTR_CMOVNAD          LIBXSMM_X86_INSTR_CMOVBED
#define LIBXSMM_X86_INSTR_CMOVNAQ          LIBXSMM_X86_INSTR_CMOVBEQ
#define LIBXSMM_X86_INSTR_CMOVNA           LIBXSMM_X86_INSTR_CMOVNAQ
#define LIBXSMM_X86_INSTR_CMOVNAEW         LIBXSMM_X86_INSTR_CMOVBW
#define LIBXSMM_X86_INSTR_CMOVNAED         LIBXSMM_X86_INSTR_CMOVBD
#define LIBXSMM_X86_INSTR_CMOVNAEQ         LIBXSMM_X86_INSTR_CMOVBQ
#define LIBXSMM_X86_INSTR_CMOVNAE          LIBXSMM_X86_INSTR_CMOVNAEQ
#define LIBXSMM_X86_INSTR_CMOVNBW          LIBXSMM_X86_INSTR_CMOVAEW
#define LIBXSMM_X86_INSTR_CMOVNBD          LIBXSMM_X86_INSTR_CMOVAED
#define LIBXSMM_X86_INSTR_CMOVNBQ          LIBXSMM_X86_INSTR_CMOVAEQ
#define LIBXSMM_X86_INSTR_CMOVNB           LIBXSMM_X86_INSTR_CMOVNBQ
#define LIBXSMM_X86_INSTR_CMOVNBEW         LIBXSMM_X86_INSTR_CMOVAW
#define LIBXSMM_X86_INSTR_CMOVNBED         LIBXSMM_X86_INSTR_CMOVAD
#define LIBXSMM_X86_INSTR_CMOVNBEQ         LIBXSMM_X86_INSTR_CMOVAQ
#define LIBXSMM_X86_INSTR_CMOVNBE          LIBXSMM_X86_INSTR_CMOVNBEQ
#define LIBXSMM_X86_INSTR_CMOVNCW          LIBXSMM_X86_INSTR_CMOVAEW
#define LIBXSMM_X86_INSTR_CMOVNCD          LIBXSMM_X86_INSTR_CMOVAED
#define LIBXSMM_X86_INSTR_CMOVNCQ          LIBXSMM_X86_INSTR_CMOVAEQ
#define LIBXSMM_X86_INSTR_CMOVNC           LIBXSMM_X86_INSTR_CMOVNCQ
#define LIBXSMM_X86_INSTR_CMOVNEW          0xa0045045
#define LIBXSMM_X86_INSTR_CMOVNED          0xa0041045
#define LIBXSMM_X86_INSTR_CMOVNEQ          0xa2841045
#define LIBXSMM_X86_INSTR_CMOVNE           LIBXSMM_X86_INSTR_CMOVNEQ
#define LIBXSMM_X86_INSTR_CMOVNGW          LIBXSMM_X86_INSTR_CMOVLEW
#define LIBXSMM_X86_INSTR_CMOVNGD          LIBXSMM_X86_INSTR_CMOVLED
#define LIBXSMM_X86_INSTR_CMOVNGQ          LIBXSMM_X86_INSTR_CMOVLEQ
#define LIBXSMM_X86_INSTR_CMOVNG           LIBXSMM_X86_INSTR_CMOVNGQ
#define LIBXSMM_X86_INSTR_CMOVNGEW         LIBXSMM_X86_INSTR_CMOVLW
#define LIBXSMM_X86_INSTR_CMOVNGED         LIBXSMM_X86_INSTR_CMOVLD
#define LIBXSMM_X86_INSTR_CMOVNGEQ         LIBXSMM_X86_INSTR_CMOVLQ
#define LIBXSMM_X86_INSTR_CMOVNGE          LIBXSMM_X86_INSTR_CMOVNGEQ
#define LIBXSMM_X86_INSTR_CMOVNLW          LIBXSMM_X86_INSTR_CMOVGEW
#define LIBXSMM_X86_INSTR_CMOVNLD          LIBXSMM_X86_INSTR_CMOVGED
#define LIBXSMM_X86_INSTR_CMOVNLQ          LIBXSMM_X86_INSTR_CMOVGEQ
#define LIBXSMM_X86_INSTR_CMOVNL           LIBXSMM_X86_INSTR_CMOVNLQ
#define LIBXSMM_X86_INSTR_CMOVNLEW         LIBXSMM_X86_INSTR_CMOVGW
#define LIBXSMM_X86_INSTR_CMOVNLED         LIBXSMM_X86_INSTR_CMOVGD
#define LIBXSMM_X86_INSTR_CMOVNLEQ         LIBXSMM_X86_INSTR_CMOVGQ
#define LIBXSMM_X86_INSTR_CMOVNLE          LIBXSMM_X86_INSTR_CMOVNLEQ
#define LIBXSMM_X86_INSTR_CMOVNOW          0xa0045041
#define LIBXSMM_X86_INSTR_CMOVNOD          0xa0041041
#define LIBXSMM_X86_INSTR_CMOVNOQ          0xa2841041
#define LIBXSMM_X86_INSTR_CMOVNO           LIBXSMM_X86_INSTR_CMOVNOQ
#define LIBXSMM_X86_INSTR_CMOVNPW          0xa004504b
#define LIBXSMM_X86_INSTR_CMOVNPD          0xa004104b
#define LIBXSMM_X86_INSTR_CMOVNPQ          0xa284104b
#define LIBXSMM_X86_INSTR_CMOVNP           LIBXSMM_X86_INSTR_CMOVNPQ
#define LIBXSMM_X86_INSTR_CMOVNSW          0xa0045049
#define LIBXSMM_X86_INSTR_CMOVNSD          0xa0041049
#define LIBXSMM_X86_INSTR_CMOVNSQ          0xa2841049
#define LIBXSMM_X86_INSTR_CMOVNS           LIBXSMM_X86_INSTR_CMOVNSQ
#define LIBXSMM_X86_INSTR_CMOVNZW          LIBXSMM_X86_INSTR_CMOVNEW
#define LIBXSMM_X86_INSTR_CMOVNZD          LIBXSMM_X86_INSTR_CMOVNED
#define LIBXSMM_X86_INSTR_CMOVNZQ          LIBXSMM_X86_INSTR_CMOVNEQ
#define LIBXSMM_X86_INSTR_CMOVNZ           LIBXSMM_X86_INSTR_CMOVNZQ
#define LIBXSMM_X86_INSTR_CMOVOW           0xa0045040
#define LIBXSMM_X86_INSTR_CMOVOD           0xa0041040
#define LIBXSMM_X86_INSTR_CMOVOQ           0xa2841040
#define LIBXSMM_X86_INSTR_CMOVO            LIBXSMM_X86_INSTR_CMOVOQ
#define LIBXSMM_X86_INSTR_CMOVPW           0xa004504a
#define LIBXSMM_X86_INSTR_CMOVPD           0xa004104a
#define LIBXSMM_X86_INSTR_CMOVPQ           0xa284104a
#define LIBXSMM_X86_INSTR_CMOVP            LIBXSMM_X86_INSTR_CMOVPQ
#define LIBXSMM_X86_INSTR_CMOVPEW          LIBXSMM_X86_INSTR_CMOVPW
#define LIBXSMM_X86_INSTR_CMOVPED          LIBXSMM_X86_INSTR_CMOVPD
#define LIBXSMM_X86_INSTR_CMOVPEQ          LIBXSMM_X86_INSTR_CMOVPQ
#define LIBXSMM_X86_INSTR_CMOVPE           LIBXSMM_X86_INSTR_CMOVPEQ
#define LIBXSMM_X86_INSTR_CMOVPOW          LIBXSMM_X86_INSTR_CMOVNPW
#define LIBXSMM_X86_INSTR_CMOVPOD          LIBXSMM_X86_INSTR_CMOVNPD
#define LIBXSMM_X86_INSTR_CMOVPOQ          LIBXSMM_X86_INSTR_CMOVNPQ
#define LIBXSMM_X86_INSTR_CMOVPO           LIBXSMM_X86_INSTR_CMOVPOQ
#define LIBXSMM_X86_INSTR_CMOVSW           0xa0045048
#define LIBXSMM_X86_INSTR_CMOVSD           0xa0041048
#define LIBXSMM_X86_INSTR_CMOVSQ           0xa2841048
#define LIBXSMM_X86_INSTR_CMOVS            LIBXSMM_X86_INSTR_CMOVSQ
#define LIBXSMM_X86_INSTR_CMOVZW           LIBXSMM_X86_INSTR_CMOVEW
#define LIBXSMM_X86_INSTR_CMOVZD           LIBXSMM_X86_INSTR_CMOVED
#define LIBXSMM_X86_INSTR_CMOVZQ           LIBXSMM_X86_INSTR_CMOVEQ
#define LIBXSMM_X86_INSTR_CMOVZ            LIBXSMM_X86_INSTR_CMOVZQ
/* Compare: CMPQ (30002) is a plain decimal id, the remaining entries are packed hex ids
 * in the three operand forms _RM_IMM*, _RM_R and _R_RM, each in B/W/D/Q width.
 * The _RM_R id equals the matching _R_RM id with 0x08000000 set and the low byte reduced by 2. */
#define LIBXSMM_X86_INSTR_CMPQ             30002
#define LIBXSMM_X86_INSTR_CMPB_RM_IMM8     0x947c0080
#define LIBXSMM_X86_INSTR_CMPW_RM_IMM16    0x947c4181
#define LIBXSMM_X86_INSTR_CMPD_RM_IMM32    0x947c0281
#define LIBXSMM_X86_INSTR_CMPQ_RM_IMM32    0x96fc0281
#define LIBXSMM_X86_INSTR_CMPB_RM_R        0xa8040038
#define LIBXSMM_X86_INSTR_CMPW_RM_R        0xa8044039
#define LIBXSMM_X86_INSTR_CMPD_RM_R        0xa8040039
#define LIBXSMM_X86_INSTR_CMPQ_RM_R        0xaa840039
#define LIBXSMM_X86_INSTR_CMPB_R_RM        0xa004003a
#define LIBXSMM_X86_INSTR_CMPW_R_RM        0xa004403b
#define LIBXSMM_X86_INSTR_CMPD_R_RM        0xa004003b
#define LIBXSMM_X86_INSTR_CMPQ_R_RM        0xa284003b
/* Signed multiply / divide.
 * IMUL (30003) is a plain decimal id, the remaining entries are packed hex ids.
 * IDIV* start with 0x9c (Q: 0x9e), IMUL* with 0xa0 (Q: 0xa2).
 * Every Q id equals its D sibling with 0x02800000 set and every W id equals its
 * D sibling with 0x00004000 set (the immediate forms additionally swap
 * 0x00000200 for 0x00000100). IDIVQ follows the same rule as IMULQ and
 * IMULQ_IMM32; without the 0x00800000 bit it would not be distinguishable
 * from the 32-bit form in that field. */
#define LIBXSMM_X86_INSTR_IMUL             30003
#define LIBXSMM_X86_INSTR_IDIVW            0x9c7440f7
#define LIBXSMM_X86_INSTR_IDIVD            0x9c7400f7
#define LIBXSMM_X86_INSTR_IDIVQ            0x9ef400f7
#define LIBXSMM_X86_INSTR_IMULW            0xa00450af
#define LIBXSMM_X86_INSTR_IMULD            0xa00410af
#define LIBXSMM_X86_INSTR_IMULQ            0xa28410af
#define LIBXSMM_X86_INSTR_IMULW_IMM16      0xa00c4169
#define LIBXSMM_X86_INSTR_IMULD_IMM32      0xa00c0269
#define LIBXSMM_X86_INSTR_IMULQ_IMM32      0xa28c0269
/* LEA, LZCNT and the GP MOV family.
 * - MOVB/MOVW/MOVD/MOVQ without suffix have the same value as their _LD name.
 * - A MOV?_ST id equals the MOV?_LD id with 0x08000000 set and the low byte reduced by 2.
 * - MOV?_RM_IMM* start with 0x94 (Q: 0x96), MOV?_R_IMM* with 0x95 (Q: 0x97);
 *   MOVQ_R_IMM64 is the only id in this block with 0x00000300 in the second byte. */
#define LIBXSMM_X86_INSTR_LEAW             0xa004408d
#define LIBXSMM_X86_INSTR_LEAD             0xa004008d
#define LIBXSMM_X86_INSTR_LEAQ             0xa284008d
#define LIBXSMM_X86_INSTR_LZCNTW           0xa00650bd
#define LIBXSMM_X86_INSTR_LZCNTD           0xa00610bd
#define LIBXSMM_X86_INSTR_LZCNTQ           0xa28610bd
#define LIBXSMM_X86_INSTR_MOVB             0xa004008a
#define LIBXSMM_X86_INSTR_MOVW             0xa004408b
#define LIBXSMM_X86_INSTR_MOVD             0xa004008b
#define LIBXSMM_X86_INSTR_MOVQ             0xa284008b
#define LIBXSMM_X86_INSTR_MOVB_LD          0xa004008a
#define LIBXSMM_X86_INSTR_MOVW_LD          0xa004408b
#define LIBXSMM_X86_INSTR_MOVD_LD          0xa004008b
#define LIBXSMM_X86_INSTR_MOVQ_LD          0xa284008b
#define LIBXSMM_X86_INSTR_MOVB_ST          0xa8040088
#define LIBXSMM_X86_INSTR_MOVW_ST          0xa8044089
#define LIBXSMM_X86_INSTR_MOVD_ST          0xa8040089
#define LIBXSMM_X86_INSTR_MOVQ_ST          0xaa840089
#define LIBXSMM_X86_INSTR_MOVB_RM_IMM8     0x940c00c6
#define LIBXSMM_X86_INSTR_MOVW_RM_IMM16    0x940c41c7
#define LIBXSMM_X86_INSTR_MOVD_RM_IMM32    0x940c02c7
#define LIBXSMM_X86_INSTR_MOVQ_RM_IMM32    0x968c02c7
#define LIBXSMM_X86_INSTR_MOVB_R_IMM8      0x950c00b0
#define LIBXSMM_X86_INSTR_MOVW_R_IMM16     0x950c41b8
#define LIBXSMM_X86_INSTR_MOVD_R_IMM32     0x950c02b8
#define LIBXSMM_X86_INSTR_MOVQ_R_IMM64     0x978c03b8
/* NEG, NOT and OR.
 * - NEG? / NOT? start with 0x9c (Q: 0x9e) and share the low bytes 0xf6 (B) / 0xf7 (W, D, Q);
 *   NEG and NOT differ in bits 20..22 of the id only (0x3 vs. 0x2).
 * - OR has no decimal id in this block (unlike ADD/AND/CMP/SUB); it comes in the three
 *   operand forms _RM_IMM*, _RM_R and _R_RM, each in B/W/D/Q width. */
#define LIBXSMM_X86_INSTR_NEGB             0x9c3400f6
#define LIBXSMM_X86_INSTR_NEGW             0x9c3440f7
#define LIBXSMM_X86_INSTR_NEGD             0x9c3400f7
#define LIBXSMM_X86_INSTR_NEGQ             0x9eb400f7
#define LIBXSMM_X86_INSTR_NOTB             0x9c2400f6
#define LIBXSMM_X86_INSTR_NOTW             0x9c2440f7
#define LIBXSMM_X86_INSTR_NOTD             0x9c2400f7
#define LIBXSMM_X86_INSTR_NOTQ             0x9ea400f7
#define LIBXSMM_X86_INSTR_ORB_RM_IMM8      0x941c0080
#define LIBXSMM_X86_INSTR_ORW_RM_IMM16     0x941c4181
#define LIBXSMM_X86_INSTR_ORD_RM_IMM32     0x941c0281
#define LIBXSMM_X86_INSTR_ORQ_RM_IMM32     0x969c0281
#define LIBXSMM_X86_INSTR_ORB_RM_R         0xa8040008
#define LIBXSMM_X86_INSTR_ORW_RM_R         0xa8044009
#define LIBXSMM_X86_INSTR_ORD_RM_R         0xa8040009
#define LIBXSMM_X86_INSTR_ORQ_RM_R         0xaa840009
#define LIBXSMM_X86_INSTR_ORB_R_RM         0xa004000a
#define LIBXSMM_X86_INSTR_ORW_R_RM         0xa004400b
#define LIBXSMM_X86_INSTR_ORD_R_RM         0xa004000b
#define LIBXSMM_X86_INSTR_ORQ_R_RM         0xa284000b
/* Stack, population count and prefetch ids.
 * - POP / PUSH exist in W and Q flavor only; the plain names start with 0x91 and the
 *   _RM names with 0x9c. The Q ids here do NOT set 0x02800000 (unlike other Q ids
 *   in this file); W ids set 0x00004000.
 * - POPCNT (30004) is a plain decimal id next to the packed POPCNTW/D/Q ids.
 * - PREFETCHT0/T1/T2/NTA share low byte 0x18 and differ in bits 20..22 of the id;
 *   PREFETCHW uses low byte 0x0d. */
#define LIBXSMM_X86_INSTR_POPW             0x91044058
#define LIBXSMM_X86_INSTR_POPQ             0x91040058
#define LIBXSMM_X86_INSTR_POPW_RM          0x9c04408f
#define LIBXSMM_X86_INSTR_POPQ_RM          0x9c04008f
#define LIBXSMM_X86_INSTR_POPCNT           30004
#define LIBXSMM_X86_INSTR_POPCNTW          0xa00650b8
#define LIBXSMM_X86_INSTR_POPCNTD          0xa00610b8
#define LIBXSMM_X86_INSTR_POPCNTQ          0xa28610b8
#define LIBXSMM_X86_INSTR_PREFETCHT0       0x94141018
#define LIBXSMM_X86_INSTR_PREFETCHT1       0x94241018
#define LIBXSMM_X86_INSTR_PREFETCHT2       0x94341018
#define LIBXSMM_X86_INSTR_PREFETCHNTA      0x94041018
#define LIBXSMM_X86_INSTR_PREFETCHW        0x9414100d
#define LIBXSMM_X86_INSTR_PUSHW            0x91044050
#define LIBXSMM_X86_INSTR_PUSHQ            0x91040050
#define LIBXSMM_X86_INSTR_PUSHW_RM         0x9c6440ff
#define LIBXSMM_X86_INSTR_PUSHQ_RM         0x9c6400ff
/* Shifts by 8-bit immediate.
 * - SALQ, SARQ and SHRQ (30005..30007) are plain decimal ids.
 * - All SHL* names are aliases of the corresponding SAL* names (same value).
 * - The packed ids share the low bytes 0xc0 (B) / 0xc1 (W, D, Q); SAL, SAR and SHR
 *   differ in bits 20..22 of the id only (0x4, 0x7, 0x5). */
#define LIBXSMM_X86_INSTR_SALQ             30005
#define LIBXSMM_X86_INSTR_SALB_RM_IMM8     0x944c00c0
#define LIBXSMM_X86_INSTR_SALW_RM_IMM8     0x944c40c1
#define LIBXSMM_X86_INSTR_SALD_RM_IMM8     0x944c00c1
#define LIBXSMM_X86_INSTR_SALQ_RM_IMM8     0x96cc00c1
#define LIBXSMM_X86_INSTR_SARQ             30006
#define LIBXSMM_X86_INSTR_SARB_RM_IMM8     0x947c00c0
#define LIBXSMM_X86_INSTR_SARW_RM_IMM8     0x947c40c1
#define LIBXSMM_X86_INSTR_SARD_RM_IMM8     0x947c00c1
#define LIBXSMM_X86_INSTR_SARQ_RM_IMM8     0x96fc00c1
#define LIBXSMM_X86_INSTR_SHLQ             LIBXSMM_X86_INSTR_SALQ
#define LIBXSMM_X86_INSTR_SHLB_RM_IMM8     LIBXSMM_X86_INSTR_SALB_RM_IMM8
#define LIBXSMM_X86_INSTR_SHLW_RM_IMM8     LIBXSMM_X86_INSTR_SALW_RM_IMM8
#define LIBXSMM_X86_INSTR_SHLD_RM_IMM8     LIBXSMM_X86_INSTR_SALD_RM_IMM8
#define LIBXSMM_X86_INSTR_SHLQ_RM_IMM8     LIBXSMM_X86_INSTR_SALQ_RM_IMM8
#define LIBXSMM_X86_INSTR_SHRQ             30007
#define LIBXSMM_X86_INSTR_SHRB_RM_IMM8     0x945c00c0
#define LIBXSMM_X86_INSTR_SHRW_RM_IMM8     0x945c40c1
#define LIBXSMM_X86_INSTR_SHRD_RM_IMM8     0x945c00c1
#define LIBXSMM_X86_INSTR_SHRQ_RM_IMM8     0x96dc00c1
/* Subtract: SUBQ (30008) is a plain decimal id, the remaining entries are packed hex ids
 * in the three operand forms _RM_IMM*, _RM_R and _R_RM, each in B/W/D/Q width.
 * As for the other two-operand ALU ids of this file (ADD, AND, CMP, OR, XOR), the
 * _RM_R id equals the matching _R_RM id with 0x08000000 set and the low byte reduced
 * by 2. The 0x08000000 bit is what separates the two operand directions in the id
 * (it is also the bit that separates _ST from _LD in the MOV ids), so it must not be
 * dropped from the _RM_R entries: subtraction is not commutative. */
#define LIBXSMM_X86_INSTR_SUBQ             30008
#define LIBXSMM_X86_INSTR_SUBB_RM_IMM8     0x945c0080
#define LIBXSMM_X86_INSTR_SUBW_RM_IMM16    0x945c4181
#define LIBXSMM_X86_INSTR_SUBD_RM_IMM32    0x945c0281
#define LIBXSMM_X86_INSTR_SUBQ_RM_IMM32    0x96dc0281
#define LIBXSMM_X86_INSTR_SUBB_RM_R        0xa8040028
#define LIBXSMM_X86_INSTR_SUBW_RM_R        0xa8044029
#define LIBXSMM_X86_INSTR_SUBD_RM_R        0xa8040029
#define LIBXSMM_X86_INSTR_SUBQ_RM_R        0xaa840029
#define LIBXSMM_X86_INSTR_SUBB_R_RM        0xa004002a
#define LIBXSMM_X86_INSTR_SUBW_R_RM        0xa004402b
#define LIBXSMM_X86_INSTR_SUBD_R_RM        0xa004002b
#define LIBXSMM_X86_INSTR_SUBQ_R_RM        0xa284002b
/* TZCNT: the unsuffixed name is a small decimal id, the W/D/Q names are hex-coded */
#define LIBXSMM_X86_INSTR_TZCNT            30009
#define LIBXSMM_X86_INSTR_TZCNTW           0xa00650bc
#define LIBXSMM_X86_INSTR_TZCNTD           0xa00610bc
#define LIBXSMM_X86_INSTR_TZCNTQ           0xa28610bc
/* XOR: immediate forms (_RM_IMM*), r/m-destination forms (_RM_R) and
   register-destination forms (_R_RM) */
#define LIBXSMM_X86_INSTR_XORB_RM_IMM8     0x946c0080
#define LIBXSMM_X86_INSTR_XORW_RM_IMM16    0x946c4181
#define LIBXSMM_X86_INSTR_XORD_RM_IMM32    0x946c0281
#define LIBXSMM_X86_INSTR_XORQ_RM_IMM32    0x96ec0281
#define LIBXSMM_X86_INSTR_XORB_RM_R        0xa8040030
#define LIBXSMM_X86_INSTR_XORW_RM_R        0xa8044031
#define LIBXSMM_X86_INSTR_XORD_RM_R        0xa8040031
#define LIBXSMM_X86_INSTR_XORQ_RM_R        0xaa840031
#define LIBXSMM_X86_INSTR_XORB_R_RM        0xa0040032
#define LIBXSMM_X86_INSTR_XORW_R_RM        0xa0044033
#define LIBXSMM_X86_INSTR_XORD_R_RM        0xa0040033
#define LIBXSMM_X86_INSTR_XORQ_R_RM        0xa2840033

/* newer instructions */
#define LIBXSMM_X86_INSTR_RDPID            0x9c7610c7

/* Jump instructions: consecutive decimal ids 30100..30108 (no hex-coded
   encoding is attached to these names) */
#define LIBXSMM_X86_INSTR_JL                 30100
#define LIBXSMM_X86_INSTR_JE                 30101
#define LIBXSMM_X86_INSTR_JZ                 30102
#define LIBXSMM_X86_INSTR_JG                 30103
#define LIBXSMM_X86_INSTR_JNE                30104
#define LIBXSMM_X86_INSTR_JNZ                30105
#define LIBXSMM_X86_INSTR_JGE                30106
#define LIBXSMM_X86_INSTR_JLE                30107
#define LIBXSMM_X86_INSTR_JMP                30108

/* Tile instructions */
/* CPUID: AMX-TILE INTERCEPT: SPR */
/* LDTILECFG, STTILECFG and TILERELEASE are small decimal ids; the remaining
   tile instructions below carry hex-coded values */
#define LIBXSMM_X86_INSTR_LDTILECFG          50001
#define LIBXSMM_X86_INSTR_STTILECFG          50002
#define LIBXSMM_X86_INSTR_TILERELEASE        50003
#define LIBXSMM_X86_INSTR_TILELOADD          0x6007204b
#define LIBXSMM_X86_INSTR_TILELOADDT1        0x6005204b
#define LIBXSMM_X86_INSTR_TILESTORED         0x6006204b
#define LIBXSMM_X86_INSTR_TILEZERO           0x50072049
/* CPUID: AMX-INT8 INTERCEPT: SPR */
#define LIBXSMM_X86_INSTR_TDPBSSD            0x7007205e
#define LIBXSMM_X86_INSTR_TDPBSUD            0x7006205e
#define LIBXSMM_X86_INSTR_TDPBUSD            0x7005205e
#define LIBXSMM_X86_INSTR_TDPBUUD            0x7004205e
/* CPUID: AMX-BF16 INTERCEPT: SPR */
#define LIBXSMM_X86_INSTR_TDPBF16PS          0x7006205c
/* CPUID: AMX-FP16 INTERCEPT: GNR */
#define LIBXSMM_X86_INSTR_TDPFP16PS          0x7007205c

/* define error codes */
/* The codes are consecutive (90000..90054); these are the values handed to
   LIBXSMM_HANDLE_ERROR as ERROR_CODE. New codes should be appended with the
   next free number so that existing values stay stable. */
#define LIBXSMM_ERR_GENERAL               90000
#define LIBXSMM_ERR_ALLOC                 90001
#define LIBXSMM_ERR_BUFFER_TOO_SMALL      90002
#define LIBXSMM_ERR_APPEND_STR            90003
#define LIBXSMM_ERR_ARCH_PREC             90004
#define LIBXSMM_ERR_ARCH                  90005
#define LIBXSMM_ERR_UNSUP_ARCH            90006
#define LIBXSMM_ERR_LDA                   90007
#define LIBXSMM_ERR_LDB                   90008
#define LIBXSMM_ERR_LDC                   90009
#define LIBXSMM_ERR_SPGEMM_GEN            90010
#define LIBXSMM_ERR_CSC_INPUT             90011
#define LIBXSMM_ERR_CSC_READ_LEN          90012
#define LIBXSMM_ERR_CSC_READ_DESC         90013
#define LIBXSMM_ERR_CSC_READ_ELEMS        90014
#define LIBXSMM_ERR_CSC_LEN               90015
#define LIBXSMM_ERR_N_BLOCK               90016
#define LIBXSMM_ERR_M_BLOCK               90017
#define LIBXSMM_ERR_K_BLOCK               90018
#define LIBXSMM_ERR_REG_BLOCK             90019
#define LIBXSMM_ERR_NO_AVX512_BCAST       90020
#define LIBXSMM_ERR_NO_AVX512_QFMA        90021
#define LIBXSMM_ERR_NO_INDEX_SCALE_ADDR   90022
#define LIBXSMM_ERR_UNSUPPORTED_JUMP      90023
#define LIBXSMM_ERR_NO_JMPLBL_AVAIL       90024
#define LIBXSMM_ERR_EXCEED_JMPLBL         90025
#define LIBXSMM_ERR_CSC_ALLOC_DATA        90026
#define LIBXSMM_ERR_CSR_ALLOC_DATA        90027
#define LIBXSMM_ERR_CSR_INPUT             90028
#define LIBXSMM_ERR_CSR_READ_LEN          90029
#define LIBXSMM_ERR_CSR_READ_DESC         90030
#define LIBXSMM_ERR_CSR_READ_ELEMS        90031
#define LIBXSMM_ERR_CSR_LEN               90032
#define LIBXSMM_ERR_UNSUP_DATATYPE        90033
#define LIBXSMM_ERR_UNSUP_DT_FORMAT       90034
#define LIBXSMM_ERR_INVALID_GEMM_CONFIG   90035
#define LIBXSMM_ERR_UNIQUE_VAL            90036
#define LIBXSMM_ERR_VEC_REG_MUST_BE_UNDEF 90037
#define LIBXSMM_ERR_JMPLBL_USED           90038
#define LIBXSMM_ERR_TRANS_B               90039
#define LIBXSMM_ERR_LDB_TRANS             90040
#define LIBXSMM_ERR_VNNI_A                90041
#define LIBXSMM_ERR_VNNI_B                90042
#define LIBXSMM_ERR_NO_AVX512VL           90043
#define LIBXSMM_ERR_GP_TEMP_MAPPING       90044
#define LIBXSMM_ERR_BITMASK_REQUIRED      90045
#define LIBXSMM_ERR_ILLEGAL_ABI           90046
#define LIBXSMM_ERR_UNKNOWN_OPERATION     90047
#define LIBXSMM_ERR_MISSING_REDUCE_FLAGS  90048
#define LIBXSMM_ERR_TRANS_A               90049
#define LIBXSMM_ERR_LDA_TRANS             90050
#define LIBXSMM_ERR_BRGEMM_TRANS          90051
#define LIBXSMM_ERR_ILLEGAL_REGNUM        90052
#define LIBXSMM_ERR_UNSUP_SIZE            90053
#define LIBXSMM_ERR_BCSC_BLOCK_SIZE       90054

/* Forwards GENERATED_CODE and ERROR_CODE to libxsmm_handle_error together with
   the current function name, file and line. The last argument is
   libxsmm_verbosity once libxsmm_ninit is greater than 1, and 1 otherwise. */
#define LIBXSMM_HANDLE_ERROR(GENERATED_CODE, ERROR_CODE) libxsmm_handle_error( \
  GENERATED_CODE, ERROR_CODE, LIBXSMM_FUNCNAME, __FILE__, __LINE__, 1 < libxsmm_ninit ? libxsmm_verbosity : 1)
/* Same as LIBXSMM_HANDLE_ERROR, but the last argument is always 1 */
#define LIBXSMM_HANDLE_ERROR_VERBOSE(GENERATED_CODE, ERROR_CODE) libxsmm_handle_error( \
  GENERATED_CODE, ERROR_CODE, LIBXSMM_FUNCNAME, __FILE__, __LINE__, 1)

/* LIBXSMM_EXIT_ERROR(io_generated_code) instead of exit(-1) */
/* Unless LIBXSMM_EXIT_HARD is defined, the macro reports LIBXSMM_ERR_GENERAL
   through LIBXSMM_HANDLE_ERROR; with LIBXSMM_EXIT_HARD it calls exit(0xF) and
   does not use GENERATED_CODE. */
#if !defined(LIBXSMM_EXIT_HARD)
# define LIBXSMM_EXIT_ERROR(GENERATED_CODE) \
    LIBXSMM_HANDLE_ERROR(GENERATED_CODE, LIBXSMM_ERR_GENERAL)
#else /* hard exit */
# define LIBXSMM_EXIT_ERROR(GENERATED_CODE) exit(0xF)
#endif

/* Bracket a code region with OFF_BEGIN()/OFF_END(): when NDEBUG is defined the
   pair calls libxsmm_set_handle_error(0) and libxsmm_set_handle_error(1);
   otherwise both macros expand to nothing. */
#if defined(NDEBUG)
# define LIBXSMM_HANDLE_ERROR_OFF_BEGIN() libxsmm_set_handle_error(0)
# define LIBXSMM_HANDLE_ERROR_OFF_END() libxsmm_set_handle_error(1)
#else
# define LIBXSMM_HANDLE_ERROR_OFF_BEGIN()
# define LIBXSMM_HANDLE_ERROR_OFF_END()
#endif

/* tile config structure */
/* Holds a palette id plus one (rowsb, cols) pair for each of the tiles 0..7.
   NOTE(review): this is a plain C struct with natural padding; presumably it is
   translated into the hardware configuration format elsewhere - confirm. */
typedef struct libxsmm_tile_config {
  unsigned char  palette_id;
  unsigned short tile0rowsb;
  unsigned char  tile0cols;
  unsigned short tile1rowsb;
  unsigned char  tile1cols;
  unsigned short tile2rowsb;
  unsigned char  tile2cols;
  unsigned short tile3rowsb;
  unsigned char  tile3cols;
  unsigned short tile4rowsb;
  unsigned char  tile4cols;
  unsigned short tile5rowsb;
  unsigned char  tile5cols;
  unsigned short tile6rowsb;
  unsigned char  tile6cols;
  unsigned short tile7rowsb;
  unsigned char  tile7cols;
} libxsmm_tile_config;

/* structure for tracking local labels in assembly we do not allow overlapping loops */
LIBXSMM_EXTERN_C typedef struct libxsmm_loop_label_tracker_struct {
  /* fixed capacity of 512 label addresses */
  unsigned int label_address[512];
  /* presumably the number of entries of label_address in use - confirm */
  unsigned int label_count;
} libxsmm_loop_label_tracker;

/* structure to save jump properties to the same destination */
LIBXSMM_EXTERN_C typedef struct libxsmm_jump_source_struct {
  /* parallel arrays (512 entries each): instruction type and address per jump */
  unsigned int instr_type[512];
  unsigned int instr_addr[512];
  /* presumably the number of recorded jumps, i.e. valid entries above - confirm */
  unsigned int ref_count;
} libxsmm_jump_source;

/* structure for tracking arbitrary jump labels in assembly code */
/* Both arrays have 512 entries; presumably they are indexed by the same label
   id (address of the label and the jumps referring to it) - confirm.
   Note that each libxsmm_jump_source holds two 512-entry arrays itself, so
   this struct is large (on the order of 2 MB with 4-byte unsigned int). */
LIBXSMM_EXTERN_C typedef struct libxsmm_jump_label_tracker_struct {
  unsigned int        label_address[512];
  libxsmm_jump_source label_source[512];
} libxsmm_jump_label_tracker;

/* Tracker for constant data: a byte buffer of 81920 bytes with its size field,
   and up to 128 recorded values for the instructions loading from it
   (presumably code positions of PC-relative loads - confirm). */
LIBXSMM_EXTERN_C typedef struct libxsmm_const_data_tracker {
  unsigned char const_data[81920];
  unsigned int const_data_size;
  unsigned int const_data_pc_load_insns[128];
  unsigned int const_data_nload_insns;
} libxsmm_const_data_tracker;

/* Blocking description with up to four sizes; it is used as the element type
   of m_blocking_info[] in libxsmm_micro_kernel_config.
   NOTE(review): presumably "tiles" is the number of valid entries in "sizes" - confirm. */
LIBXSMM_EXTERN_C typedef struct libxsmm_blocking_info_t {
  unsigned int tiles;
  unsigned int sizes[4];
  unsigned int blocking;
  unsigned int block_size;
} libxsmm_blocking_info_t;

/* micro kernel configuration */
/* Collects, in this order: basic ISA/vector parameters and the instructions
   selected for moves/ALU, GEMM fusion flags, register assignments used for
   fusion, iteration-space helper arrays, fields used when emulating AMX,
   fields propagating kernel info, and flags for low-precision emulation and
   stack-based data preparation. */
LIBXSMM_EXTERN_C typedef struct libxsmm_micro_kernel_config {
  unsigned int instruction_set;
  unsigned int vector_reg_count;
  unsigned int vector_length;
  unsigned int datatype_size_in;
  unsigned int datatype_size_in2;
  unsigned int datatype_size_out;
  unsigned int a_vmove_instruction;
  unsigned int b_vmove_instruction;
  unsigned int b_shuff_instruction;
  unsigned int c_vmove_instruction;
  unsigned int c_vmove_nts_instruction;
  unsigned int use_masking_a_c;
  unsigned int prefetch_instruction;
  unsigned int vxor_instruction;
  unsigned int vmul_instruction;
  unsigned int vadd_instruction;
  unsigned int alu_add_instruction;
  unsigned int alu_sub_instruction;
  unsigned int alu_cmp_instruction;
  unsigned int alu_jmp_instruction;
  unsigned int alu_mov_instruction;
  char vector_name;

  /* Auxiliary variables for GEMM fusion info */
  unsigned int fused_eltwise;
  unsigned int m_loop_exists;
  unsigned int n_loop_exists;
  unsigned int fused_bcolbias;
  unsigned int fused_hcolbias;
  unsigned int fused_b8colbias;
  unsigned int fused_h8colbias;
  unsigned int fused_scolbias;
  unsigned int fused_relu;
  unsigned int fused_relu_nobitmask;
  unsigned int fused_relu_bwd;
  unsigned int fused_sigmoid;
  unsigned int overwrite_C;
  unsigned int vnni_format_C;
  unsigned int sparsity_factor_A;
  unsigned int decompress_A;
  unsigned int vnni_cvt_output_ext_buf;
  unsigned int norm_to_normT_B_ext_buf;
  unsigned int has_colbias_act_fused;
  unsigned int current_m; /* this is a hack, it's for tracking in SSE relubit
                             mask fusion the logical M start as we only get 4
                             mask bits, but can only read and write at 8 bits
                             granularity */
  unsigned int m_bitmask_advance; /* this is a hack, it's for tracking in SSE relubit
                             mask fusion the logical M start as we only get 4
                             mask bits, but can only read and write at 8 bits
                             granularity
                             NOTE(review): this text is identical to the comment
                             on current_m and looks copied; the distinct role of
                             m_bitmask_advance should be documented - confirm */

  /* Register names/logistics for fusion book-keeping */
  unsigned int reserved_zmms;
  unsigned int reserved_mask_regs;
  unsigned int vnni_perm_reg;
  unsigned int zero_reg;
  unsigned int scf_vreg;
  unsigned int aux_vreg;
  unsigned int vec_x2;
  unsigned int vec_nom;
  unsigned int vec_denom;
  unsigned int vec_c0;
  unsigned int vec_c1;
  unsigned int vec_c2;
  unsigned int vec_c3;
  unsigned int vec_c1_d;
  unsigned int vec_c2_d;
  unsigned int vec_c3_d;
  unsigned int vec_hi_bound;
  unsigned int vec_lo_bound;
  unsigned int vec_ones;
  unsigned int vec_neg_ones;
  unsigned int vec_halves;
  unsigned int mask_hi;
  unsigned int mask_lo;
  unsigned int perm_table_vnni_lo;
  unsigned int perm_table_vnni_hi;
  unsigned int norm_to_normT_mask_reg_0;
  unsigned int norm_to_normT_mask_reg_1;
  unsigned int mask_m_fp32;
  unsigned int mask_m_bf16;
  unsigned int mask_m_lp_cvt;
  unsigned int mask_lo_i4;
  unsigned int mask_hi_i4;
  unsigned int perm_table_zpt_bcast;
  unsigned int luth_reg0;
  unsigned int luth_reg1;
  unsigned int lutl_reg0;
  unsigned int lutl_reg1;
  unsigned int sign_reg;
  unsigned int blend_reg;
  unsigned int tmp_reg0;
  unsigned int tmp_reg1;

  /* Auxiliary arrays for micro-kernel iteration space traversal */
  int use_paired_tilestores;
  int m_tiles;
  int n_tiles;
  int _im[4];
  int _in[4];
  int _C_tile_id[4];
  int _C_tile_mate_id[4];
  int _im_offset_prefix_sums[4];
  int _in_offset_prefix_sums[4];
  libxsmm_blocking_info_t m_blocking_info[2];

  /* Auxiliary data structure and fields when emulating AMX instructions */
  libxsmm_tile_config tile_config;
  unsigned int gemm_scratch_ld;
  unsigned int emulate_cvt2bf16fp32;
  unsigned int emulate_cvt2bf16fp32_vperm;
  unsigned int emulate_cvt2bf16fp32_vaux;
  unsigned int emulate_cvt2bf16fp32_vaux0;
  unsigned int emulate_cvt2bf16fp32_vaux1;
  unsigned int mask_cvt_hi;
  unsigned int mask_cvt_lo;
  libxsmm_loop_label_tracker *io_loop_label_tracker;

  /* Auxiliary fields to propagate kernel info */
  unsigned int m_remainder;
  unsigned int br_loop_index;
  unsigned int cur_unroll_factor;
  unsigned int is_peeled_br_loop;
  libxsmm_jump_label_tracker *p_jump_label_tracker;
  unsigned int loop_label_id;
  unsigned int k_amx_microkernel;
  unsigned int B_offs_trans;
  unsigned int stride_b_trans;

  /* Auxiliary fields for LP emulations and stack-based data preparation */
  unsigned int bf8_gemm_via_stack_alloc_tensors;
  unsigned int hf8_gemm_via_stack_alloc_tensors;
  unsigned int atrans_gemm_stack_alloc_tensors;
  unsigned int avnni_gemm_stack_alloc_tensors;
  unsigned int avnni_gemm_sw_pipeline;
  unsigned int atvnni_gemm_stack_alloc_tensors;
  unsigned int avnni_btrans_gemm_stack_alloc_tensors;
  unsigned int atvnni_btrans_gemm_stack_alloc_tensors;
  unsigned int bvnni_btrans_gemm_stack_alloc_tensors;
} libxsmm_micro_kernel_config;

/* structure for storing the current gp reg mapping */
/* Every field is an unsigned int; presumably each holds a general-purpose
   register number such as the LIBXSMM_X86_GP_REG_* values (with
   LIBXSMM_X86_GP_REG_UNDEF for unused roles) - confirm for other targets. */
LIBXSMM_EXTERN_C typedef struct libxsmm_gp_reg_mapping_struct {
  unsigned int gp_reg_param_struct;
  /* matrix operand pointers, their bases/offsets and prefetch pointers */
  unsigned int gp_reg_a;
  unsigned int gp_reg_a_base;
  unsigned int gp_reg_b;
  unsigned int gp_reg_b_base;
  unsigned int gp_reg_c;
  unsigned int gp_reg_a_prefetch;
  unsigned int gp_reg_a_offset;
  unsigned int gp_reg_b_prefetch;
  unsigned int gp_reg_b_offset;
/*  unsigned int gp_reg_c_prefetch;*/
  /* loop counters */
  unsigned int gp_reg_mloop;
  unsigned int gp_reg_nloop;
  unsigned int gp_reg_kloop;
  unsigned int gp_reg_reduce_count;
  unsigned int gp_reg_reduce_loop;
  unsigned int gp_reg_a_ptrs;
  unsigned int gp_reg_b_ptrs;
  /* leading dimensions */
  unsigned int gp_reg_lda;
  unsigned int gp_reg_ldb;
  unsigned int gp_reg_ldc;
  unsigned int gp_reg_scf;
  unsigned int gp_reg_zpt;
  /* helper/scratch registers */
  unsigned int gp_reg_help_0;
  unsigned int gp_reg_help_1;
  unsigned int gp_reg_help_2;
  unsigned int gp_reg_help_3;
  unsigned int gp_reg_help_4;
  unsigned int gp_reg_help_5;
  unsigned int gp_reg_help_6;
/* Auxiliary regs for sparsity in A support  */
  unsigned int gp_reg_bitmap_a;
  unsigned int gp_reg_decompressed_a;
  unsigned int gp_reg_decompressed_elts;
  unsigned int gp_reg_popcnt;
} libxsmm_gp_reg_mapping;

/* structure for storing the current gp reg mapping for matcopy */
/* Roles: operands a/b with their leading dimensions, prefetch pointers for a
   and b, the m/n loop counters and one helper register. */
LIBXSMM_EXTERN_C typedef struct libxsmm_matcopy_gp_reg_mapping_struct {
  unsigned int gp_reg_a;
  unsigned int gp_reg_lda;
  unsigned int gp_reg_b;
  unsigned int gp_reg_ldb;
  unsigned int gp_reg_a_pf;
  unsigned int gp_reg_b_pf;
  unsigned int gp_reg_m_loop;
  unsigned int gp_reg_n_loop;
  unsigned int gp_reg_help_0;
} libxsmm_matcopy_gp_reg_mapping;

/* matcopy kernel configuration */
/* ISA/vector parameters followed by the instructions selected for prefetch,
   vector move, ALU operations and vector xor; vector_name is a single char
   (presumably the register-name letter used by the instruction emitter - confirm). */
LIBXSMM_EXTERN_C typedef struct libxsmm_matcopy_kernel_config_struct {
  unsigned int instruction_set;
  unsigned int vector_reg_count;
  unsigned int vector_length;
  unsigned int datatype_size;
  unsigned int prefetch_instruction;
  unsigned int vmove_instruction;
  unsigned int alu_add_instruction;
  unsigned int alu_cmp_instruction;
  unsigned int alu_jmp_instruction;
  unsigned int alu_mov_instruction;
  unsigned int vxor_instruction;
  char vector_name;
} libxsmm_matcopy_kernel_config;

/* structure for storing the current gp reg mapping for mateltwise */
/* One unsigned int per register role. An instance of this struct is embedded
   in libxsmm_matequation_gp_reg_mapping as gp_reg_mapping_eltwise. */
LIBXSMM_EXTERN_C typedef struct libxsmm_mateltwise_gp_reg_mapping_struct {
  unsigned int gp_reg_param_struct;
  /* inputs, outputs and their leading dimensions */
  unsigned int gp_reg_in;
  unsigned int gp_reg_in2;
  unsigned int gp_reg_in3;
  unsigned int gp_reg_in_pf;
  unsigned int gp_reg_ldi;
  unsigned int gp_reg_out;
  unsigned int gp_reg_out2;
  unsigned int gp_reg_ldo;
  unsigned int gp_reg_relumask;
  unsigned int gp_reg_fam_lualpha;
  unsigned int gp_reg_offset;
  unsigned int gp_reg_offset_2;
  /* dropout / PRNG related roles */
  unsigned int gp_reg_dropoutmask;
  unsigned int gp_reg_dropoutprob;
  unsigned int gp_reg_prngstate;
  /* reduction and scale/shift/bias related roles */
  unsigned int gp_reg_reduced_elts;
  unsigned int gp_reg_reduced_elts_squared;
  unsigned int gp_reg_scale_vals;
  unsigned int gp_reg_shift_vals;
  unsigned int gp_reg_bias_vals;
  unsigned int gp_reg_scale_vals2;
  unsigned int gp_reg_shift_vals2;
  unsigned int gp_reg_bias_vals2;
  /* loop counters */
  unsigned int gp_reg_m_loop;
  unsigned int gp_reg_n_loop;
  unsigned int gp_reg_n;
  /* index/base roles */
  unsigned int gp_reg_ind_base;
  unsigned int gp_reg_in_base;
  unsigned int gp_reg_invec;
  unsigned int gp_reg_ind_base2;
  unsigned int gp_reg_in_base2;
  unsigned int gp_reg_in_pf2;
  unsigned int gp_reg_scale_base;
  unsigned int gp_reg_quant_sf;
  /* scratch registers */
  unsigned int gp_reg_scratch_0;
  unsigned int gp_reg_scratch_1;
} libxsmm_mateltwise_gp_reg_mapping;

/* mateltwise kernel configuration */
/* Starts with ISA parameters, datatype sizes and selected instructions; the
   remaining fields are grouped by the section comments below. An instance of
   this struct is embedded in libxsmm_matequation_kernel_config. */
LIBXSMM_EXTERN_C typedef struct libxsmm_mateltwise_kernel_config_struct {
  unsigned int instruction_set;
  unsigned int vector_reg_count;
  unsigned int datatype_size_in;
  unsigned int datatype_size_in1;
  unsigned int datatype_size_in2;
  unsigned int datatype_size_out;
  unsigned int vmove_instruction_in;
  unsigned int vmove_instruction_in1;
  unsigned int vmove_instruction_in2;
  unsigned int vmove_instruction_out;
  unsigned int alu_add_instruction;
  unsigned int alu_sub_instruction;
  unsigned int alu_cmp_instruction;
  unsigned int alu_jmp_instruction;
  unsigned int alu_mov_instruction;
  unsigned int vxor_instruction;

  /* some helper values for kernels using bitmasks */
  unsigned int ldi_mask;
  unsigned int ldo_mask;

  /* Auxiliary variables for vreg management */
  unsigned int reserved_zmms;
  unsigned int reserved_mask_regs;
  unsigned int use_fp32bf16_cvt_replacement;
  unsigned int dcvt_mask_aux0;
  unsigned int dcvt_mask_aux1;
  unsigned int dcvt_mask_aux2;
  unsigned int dcvt_zmm_aux0;
  unsigned int dcvt_zmm_aux1;
  unsigned int dcvt_zmm_aux2;
  unsigned int dcvt_zmm_aux3;
  unsigned int inout_vreg_mask;
  unsigned int tmp_vreg;
  unsigned int tmp_vreg2;
  unsigned int tmp_vreg3;
  unsigned int zero_vreg;
  unsigned int vec_x2;
  unsigned int vec_nom;
  unsigned int vec_denom;
  unsigned int vec_c0;
  unsigned int vec_c01;
  unsigned int vec_c02;
  unsigned int vec_c03;
  unsigned int vec_c1;
  unsigned int vec_c11;
  unsigned int vec_c12;
  unsigned int vec_c13;
  unsigned int vec_c2;
  unsigned int vec_c21;
  unsigned int vec_c22;
  unsigned int vec_c23;
  unsigned int vec_c1_d;
  unsigned int vec_c2_d;
  unsigned int vec_c3_d;
  unsigned int vec_c3;
  unsigned int vec_hi_bound;
  unsigned int vec_lo_bound;
  unsigned int vec_ones;
  unsigned int vec_neg_ones;
  unsigned int vec_halves;
  unsigned int mask_hi;
  unsigned int mask_lo;
  unsigned int blend_tmp_mask;

  /* Additional aux variables for exp */
  unsigned int vec_log2e;
  unsigned int vec_ln2;
  unsigned int vec_c4;
  unsigned int vec_c5;
  unsigned int vec_y;
  unsigned int vec_z;
  unsigned int vec_expmask;
  unsigned int vec_logfmax;
  unsigned int vec_logfmin;
  unsigned int aux_mask;

  /* Additional aux variables for gelu */
  /* note: vec_C0/vec_C1/vec_C2 are separate fields from the lower-case
     vec_c0/vec_c1/vec_c2 declared above */
  unsigned int vec_xr;
  unsigned int vec_xa;
  unsigned int vec_index;
  unsigned int vec_C0;
  unsigned int vec_C1;
  unsigned int vec_C2;
  unsigned int vec_thres;
  unsigned int vec_absmask;
  unsigned int vec_scale;
  unsigned int vec_shifter;

  /* Additional aux variables for minimax approximations */
  unsigned int vec_c0_lo;
  unsigned int vec_c0_hi;
  unsigned int vec_c1_lo;
  unsigned int vec_c1_hi;
  unsigned int vec_c2_lo;
  unsigned int vec_c2_hi;
  unsigned int vec_tmp0;
  unsigned int vec_tmp1;
  unsigned int vec_tmp2;
  unsigned int vec_tmp3;
  unsigned int vec_tmp4;
  unsigned int vec_tmp5;
  unsigned int vec_tmp6;
  unsigned int vec_tmp7;
  /* signed offsets (presumably relative to RBP, as the names suggest - confirm) */
  int rbp_offs_thres;
  int rbp_offs_signmask;
  int rbp_offs_absmask;
  int rbp_offs_scale;
  int rbp_offs_shifter;
  int rbp_offs_half;

  /* Aux variables for relu variants */
  unsigned int mask_helper0_vreg;
  unsigned int mask_helper1_vreg;
  unsigned int fam_lu_vreg_alpha;

  /* Aux variable for dropout */
  unsigned int prng_state0_vreg;
  unsigned int prng_state1_vreg;
  unsigned int prng_state2_vreg;
  unsigned int prng_state3_vreg;
  unsigned int dropout_vreg_tmp0;
  unsigned int dropout_vreg_tmp1;
  unsigned int dropout_vreg_tmp2;
  unsigned int dropout_vreg_one;
  unsigned int dropout_vreg_zero;
  unsigned int dropout_prob_vreg;
  unsigned int dropout_invprob_vreg;
  unsigned int dropout_vreg_avxmask;

  /* aux variable for stochastic rounding */
  unsigned int prng_vreg_tmp0;
  unsigned int prng_vreg_tmp1;
  unsigned int prng_vreg_rand;

  /* aux variable for quantization */
  unsigned int quant_vreg_scf;

  /* Misc aux variables */
  unsigned int neg_signs_vreg;

  /* Aux variables for kernel config */
  unsigned int vlen_in;
  unsigned int vlen_in1;
  unsigned int vlen_in2;
  unsigned int vlen_out;
  unsigned int vlen_comp;
  unsigned int loop_order;
  unsigned int skip_pushpops_callee_gp_reg;
  unsigned int use_stack_vars;
  char vector_name;
} libxsmm_mateltwise_kernel_config;

/* structure for storing the current gp reg mapping for matequation */
/* Besides its own register roles, the struct embeds (by value) a complete
   mateltwise mapping and a complete GEMM mapping as its last two members. */
LIBXSMM_EXTERN_C typedef struct libxsmm_matequation_gp_reg_mapping_struct {
  unsigned int                      gp_reg_param_struct;
  unsigned int gp_reg_in;
  unsigned int gp_reg_in2;
  unsigned int gp_reg_in_pf;
  unsigned int gp_reg_ldi;
  unsigned int gp_reg_out;
  unsigned int gp_reg_ldo;
  unsigned int gp_reg_relumask;
  unsigned int gp_reg_m_loop;
  unsigned int gp_reg_n_loop;
  unsigned int gp_reg_n;
  unsigned int gp_reg_offset;
  /* temporaries and scratch registers */
  unsigned int temp_reg;
  unsigned int temp_reg2;
  unsigned int temp_reg3;
  unsigned int gp_reg_scratch_0;
  unsigned int gp_reg_scratch_1;
  unsigned int gp_reg_scratch_2;
  /* nested mappings for the eltwise and GEMM parts */
  libxsmm_mateltwise_gp_reg_mapping gp_reg_mapping_eltwise;
  libxsmm_gp_reg_mapping            gp_reg_mapping_gemm;
} libxsmm_matequation_gp_reg_mapping;

/* matequation kernel configuration */
/* Holds ISA parameters and selected instructions, mask/conversion settings,
   a pool of up to 32 GPRs, pointers to argument/op-argument info, pools of up
   to 64 unary and 64 binary ops, and (by value) a full mateltwise and a full
   GEMM kernel configuration as the last two members. */
LIBXSMM_EXTERN_C typedef struct libxsmm_matequation_kernel_config_struct {
  unsigned int instruction_set;
  unsigned int vector_reg_count;
  unsigned int datatype_size_in;
  unsigned int datatype_size_out;
  unsigned int vmove_instruction_in;
  unsigned int vmove_instruction_out;
  unsigned int alu_add_instruction;
  unsigned int alu_sub_instruction;
  unsigned int alu_cmp_instruction;
  unsigned int alu_jmp_instruction;
  unsigned int alu_mov_instruction;
  unsigned int vxor_instruction;
  unsigned int skip_pushpops_callee_gp_reg;
  unsigned int n_args;
  unsigned int n_opargs;
  unsigned int vlen_in;
  unsigned int vlen_comp;
  unsigned int vlen_out;
  char vector_name;
  /* mask settings for input/output */
  unsigned int                      in_f32_mask;
  unsigned int                      in_bf16_mask;
  unsigned int                      out_f32_mask;
  unsigned int                      out_bf16_mask;
  unsigned int                      full_vlen_bf16_mask;
  unsigned int                      is_head_reduce_to_scalar;
  unsigned int                      inout_vreg_mask;
  unsigned int                      inout_dump_mask;
  libxsmm_datatype                  dtype_out_masked;
  libxsmm_datatype                  dtype_in_masked;
  unsigned int                      m_in_masked;
  unsigned int                      m_out_masked;
  unsigned int                      out_mask;
  /* result conversion settings and auxiliary registers for conversions */
  unsigned int                      cvt_result_to_bf16;
  unsigned int                      use_fp32bf16_cvt_replacement;
  unsigned int                      cvt_result_to_f16;
  unsigned int                      cvt_result_to_bf8;
  unsigned int                      cvt_result_to_hf8;
  unsigned int                      tmp_vreg;
  unsigned int                      dcvt_mask_aux0;
  unsigned int                      dcvt_mask_aux1;
  unsigned int                      dcvt_mask_aux2;
  unsigned int                      dcvt_zmm_aux0;
  unsigned int                      dcvt_zmm_aux1;
  unsigned int                      dcvt_zmm_aux2;
  unsigned int                      dcvt_zmm_aux3;
  unsigned int                      reduce_vreg;
  /* presumably n_avail_gpr counts the valid entries of gpr_pool - confirm */
  unsigned int                      n_avail_gpr;
  unsigned int                      gpr_pool[32];
  unsigned int                      n_tmp_reg_blocks;
  unsigned int                      contains_binary_op;
  unsigned int                      contains_ternary_op;
  unsigned int                      tmp_size;
  /* pointers only; ownership/lifetime of the pointed-to data is not visible here */
  libxsmm_meqn_arg         *arg_info;
  libxsmm_meqn_tmp_info       *oparg_info;
  unsigned int                      reserved_zmms;
  unsigned int                      reserved_mask_regs;
  unsigned int                      register_block_size;
  unsigned int                      unary_ops_pool[64];
  unsigned int                      binary_ops_pool[64];
  /* nested configurations for the eltwise and GEMM parts */
  libxsmm_mateltwise_kernel_config  meltw_kernel_config;
  libxsmm_micro_kernel_config       gemm_kernel_config;
} libxsmm_matequation_kernel_config;

/* structure for storing the current gp reg mapping for transpose */
/* Roles: operands a/b with their leading dimensions, the m/n loop counters
   and six helper registers. */
LIBXSMM_EXTERN_C typedef struct libxsmm_transpose_gp_reg_mapping_struct {
  unsigned int gp_reg_a;
  unsigned int gp_reg_lda;
  unsigned int gp_reg_b;
  unsigned int gp_reg_ldb;
  unsigned int gp_reg_m_loop;
  unsigned int gp_reg_n_loop;
  unsigned int gp_reg_help_0;
  unsigned int gp_reg_help_1;
  unsigned int gp_reg_help_2;
  unsigned int gp_reg_help_3;
  unsigned int gp_reg_help_4;
  unsigned int gp_reg_help_5;
} libxsmm_transpose_gp_reg_mapping;

/* transpose kernel configuration */
/* Minimal configuration: instruction set, number of vector registers, and a
   single-char vector_name as also found in the other kernel configs. */
LIBXSMM_EXTERN_C typedef struct libxsmm_transpose_kernel_config_struct {
  unsigned int instruction_set;
  unsigned int vector_reg_count;
  char vector_name;
} libxsmm_transpose_kernel_config;

/* Identifiers for the mateltwise fields: three inputs (IN0..IN2), the output
   (OUT) and COMP (presumably the compute type - confirm). */
typedef enum libxsmm_meltw_field_type {
  LIBXSMM_MELTW_FIELD_IN0  = 0,
  LIBXSMM_MELTW_FIELD_IN1  = 1,
  LIBXSMM_MELTW_FIELD_IN2  = 2,
  LIBXSMM_MELTW_FIELD_OUT  = 3,
  LIBXSMM_MELTW_FIELD_COMP = 4
} libxsmm_meltw_field_type;

/* Auxiliary stack variable enumeration for kernels */
/* Values are consecutive, 0 (NONE) through 27. */
typedef enum libxsmm_meltw_stack_var {
  LIBXSMM_MELTW_STACK_VAR_NONE        = 0,
  /* three pointers for each of the inputs 0..2 */
  LIBXSMM_MELTW_STACK_VAR_INP0_PTR0   = 1,
  LIBXSMM_MELTW_STACK_VAR_INP0_PTR1   = 2,
  LIBXSMM_MELTW_STACK_VAR_INP0_PTR2   = 3,
  LIBXSMM_MELTW_STACK_VAR_INP1_PTR0   = 4,
  LIBXSMM_MELTW_STACK_VAR_INP1_PTR1   = 5,
  LIBXSMM_MELTW_STACK_VAR_INP1_PTR2   = 6,
  LIBXSMM_MELTW_STACK_VAR_INP2_PTR0   = 7,
  LIBXSMM_MELTW_STACK_VAR_INP2_PTR1   = 8,
  LIBXSMM_MELTW_STACK_VAR_INP2_PTR2   = 9,
  /* three output pointers and the scratch pointer */
  LIBXSMM_MELTW_STACK_VAR_OUT_PTR0    = 10,
  LIBXSMM_MELTW_STACK_VAR_OUT_PTR1    = 11,
  LIBXSMM_MELTW_STACK_VAR_OUT_PTR2    = 12,
  LIBXSMM_MELTW_STACK_VAR_SCRATCH_PTR = 13,
  /* ten constant slots */
  LIBXSMM_MELTW_STACK_VAR_CONST_0     = 14,
  LIBXSMM_MELTW_STACK_VAR_CONST_1     = 15,
  LIBXSMM_MELTW_STACK_VAR_CONST_2     = 16,
  LIBXSMM_MELTW_STACK_VAR_CONST_3     = 17,
  LIBXSMM_MELTW_STACK_VAR_CONST_4     = 18,
  LIBXSMM_MELTW_STACK_VAR_CONST_5     = 19,
  LIBXSMM_MELTW_STACK_VAR_CONST_6     = 20,
  LIBXSMM_MELTW_STACK_VAR_CONST_7     = 21,
  LIBXSMM_MELTW_STACK_VAR_CONST_8     = 22,
  LIBXSMM_MELTW_STACK_VAR_CONST_9     = 23,
  /* four operation-argument slots */
  LIBXSMM_MELTW_STACK_VAR_OP_ARG_0    = 24,
  LIBXSMM_MELTW_STACK_VAR_OP_ARG_1    = 25,
  LIBXSMM_MELTW_STACK_VAR_OP_ARG_2    = 26,
  LIBXSMM_MELTW_STACK_VAR_OP_ARG_3    = 27
} libxsmm_meltw_stack_var;

/* Stack variable enumeration for matequation kernels. Values are consecutive,
   0 (NONE) through 45. Note that the PARAM_STRUCT_PTR slots are split in two
   ranges: PTR0..PTR15 take 4..19, while PTR16..PTR31 follow the CONST slots
   and take 30..45. */
typedef enum libxsmm_meqn_stack_var {
  LIBXSMM_MEQN_STACK_VAR_NONE               = 0,
  LIBXSMM_MEQN_STACK_VAR_SCRATCH_PTR        = 1,
  LIBXSMM_MEQN_STACK_VAR_ADDR_SCRATCH_PTR   = 2,
  LIBXSMM_MEQN_STACK_VAR_OUT_PTR            = 3,
  /* parameter-struct pointers, first range */
  LIBXSMM_MEQN_STACK_VAR_PARAM_STRUCT_PTR0  = 4,
  LIBXSMM_MEQN_STACK_VAR_PARAM_STRUCT_PTR1  = 5,
  LIBXSMM_MEQN_STACK_VAR_PARAM_STRUCT_PTR2  = 6,
  LIBXSMM_MEQN_STACK_VAR_PARAM_STRUCT_PTR3  = 7,
  LIBXSMM_MEQN_STACK_VAR_PARAM_STRUCT_PTR4  = 8,
  LIBXSMM_MEQN_STACK_VAR_PARAM_STRUCT_PTR5  = 9,
  LIBXSMM_MEQN_STACK_VAR_PARAM_STRUCT_PTR6  = 10,
  LIBXSMM_MEQN_STACK_VAR_PARAM_STRUCT_PTR7  = 11,
  LIBXSMM_MEQN_STACK_VAR_PARAM_STRUCT_PTR8  = 12,
  LIBXSMM_MEQN_STACK_VAR_PARAM_STRUCT_PTR9  = 13,
  LIBXSMM_MEQN_STACK_VAR_PARAM_STRUCT_PTR10 = 14,
  LIBXSMM_MEQN_STACK_VAR_PARAM_STRUCT_PTR11 = 15,
  LIBXSMM_MEQN_STACK_VAR_PARAM_STRUCT_PTR12 = 16,
  LIBXSMM_MEQN_STACK_VAR_PARAM_STRUCT_PTR13 = 17,
  LIBXSMM_MEQN_STACK_VAR_PARAM_STRUCT_PTR14 = 18,
  LIBXSMM_MEQN_STACK_VAR_PARAM_STRUCT_PTR15 = 19,
  /* ten constant slots */
  LIBXSMM_MEQN_STACK_VAR_CONST_0            = 20,
  LIBXSMM_MEQN_STACK_VAR_CONST_1            = 21,
  LIBXSMM_MEQN_STACK_VAR_CONST_2            = 22,
  LIBXSMM_MEQN_STACK_VAR_CONST_3            = 23,
  LIBXSMM_MEQN_STACK_VAR_CONST_4            = 24,
  LIBXSMM_MEQN_STACK_VAR_CONST_5            = 25,
  LIBXSMM_MEQN_STACK_VAR_CONST_6            = 26,
  LIBXSMM_MEQN_STACK_VAR_CONST_7            = 27,
  LIBXSMM_MEQN_STACK_VAR_CONST_8            = 28,
  LIBXSMM_MEQN_STACK_VAR_CONST_9            = 29,
  /* parameter-struct pointers, second range */
  LIBXSMM_MEQN_STACK_VAR_PARAM_STRUCT_PTR16 = 30,
  LIBXSMM_MEQN_STACK_VAR_PARAM_STRUCT_PTR17 = 31,
  LIBXSMM_MEQN_STACK_VAR_PARAM_STRUCT_PTR18 = 32,
  LIBXSMM_MEQN_STACK_VAR_PARAM_STRUCT_PTR19 = 33,
  LIBXSMM_MEQN_STACK_VAR_PARAM_STRUCT_PTR20 = 34,
  LIBXSMM_MEQN_STACK_VAR_PARAM_STRUCT_PTR21 = 35,
  LIBXSMM_MEQN_STACK_VAR_PARAM_STRUCT_PTR22 = 36,
  LIBXSMM_MEQN_STACK_VAR_PARAM_STRUCT_PTR23 = 37,
  LIBXSMM_MEQN_STACK_VAR_PARAM_STRUCT_PTR24 = 38,
  LIBXSMM_MEQN_STACK_VAR_PARAM_STRUCT_PTR25 = 39,
  LIBXSMM_MEQN_STACK_VAR_PARAM_STRUCT_PTR26 = 40,
  LIBXSMM_MEQN_STACK_VAR_PARAM_STRUCT_PTR27 = 41,
  LIBXSMM_MEQN_STACK_VAR_PARAM_STRUCT_PTR28 = 42,
  LIBXSMM_MEQN_STACK_VAR_PARAM_STRUCT_PTR29 = 43,
  LIBXSMM_MEQN_STACK_VAR_PARAM_STRUCT_PTR30 = 44,
  LIBXSMM_MEQN_STACK_VAR_PARAM_STRUCT_PTR31 = 45
} libxsmm_meqn_stack_var;

/* Auxiliary stack variable enumeration in GEMM */
/* Values are consecutive, 0 (NONE) through 37. */
typedef enum libxsmm_gemm_stack_var {
  LIBXSMM_GEMM_STACK_VAR_NONE                   = 0,
  LIBXSMM_GEMM_STACK_VAR_PFA_PTR                = 1,
  LIBXSMM_GEMM_STACK_VAR_PFB_PTR                = 2,
  LIBXSMM_GEMM_STACK_VAR_A_OFFS_BRGEMM_PTR      = 3,
  LIBXSMM_GEMM_STACK_VAR_B_OFFS_BRGEMM_PTR      = 4,
  LIBXSMM_GEMM_STACK_VAR_INT8_SCF               = 5,
  LIBXSMM_GEMM_STACK_VAR_GEMM_SCRATCH_PTR       = 6,
  LIBXSMM_GEMM_STACK_VAR_ELT_BIAS_PTR           = 7,
  LIBXSMM_GEMM_STACK_VAR_ELT_OUTPUT_PTR         = 8,
  LIBXSMM_GEMM_STACK_VAR_ARG_7                  = 9,
  LIBXSMM_GEMM_STACK_VAR_ARG_8                  = 10,
  LIBXSMM_GEMM_STACK_VAR_ARG_9                  = 11,
  LIBXSMM_GEMM_STACK_VAR_ARG_10                 = 12,
  LIBXSMM_GEMM_STACK_VAR_ELT_BUF1               = 13,
  LIBXSMM_GEMM_STACK_VAR_ELT_BUF2               = 14,
  LIBXSMM_GEMM_STACK_VAR_ELT_BITMAP_PTR         = 15,
  LIBXSMM_GEMM_STACK_VAR_ELT_DECOMPRESS_BUF     = 16,
  LIBXSMM_GEMM_STACK_VAR_TRANS_EXT_BUF_B        = 17,
  LIBXSMM_GEMM_STACK_VAR_TRANS_EXT_BUF_C        = 18,
  LIBXSMM_GEMM_STACK_VAR_ELT_RELU_BITMASK_PTR   = 19,
  LIBXSMM_GEMM_STACK_VAR_BRCOUNT                = 20,
  LIBXSMM_GEMM_STACK_VAR_TRANSPOSE_PTR          = 21,
  LIBXSMM_GEMM_STACK_VAR_AVX2_MASK_PTR          = 22,
  LIBXSMM_GEMM_STACK_VAR_SSE_AVX2_LP_HELPER_PTR = 23,
  LIBXSMM_GEMM_STACK_VAR_A_EMU_PTR              = 24,
  LIBXSMM_GEMM_STACK_VAR_B_EMU_PTR              = 25,
  LIBXSMM_GEMM_STACK_VAR_MELTW_STRUCT_PTR       = 26,
  LIBXSMM_GEMM_STACK_VAR_A_SCRATCH_PTR          = 27,
  LIBXSMM_GEMM_STACK_VAR_C_SCRATCH_PTR          = 28,
  LIBXSMM_GEMM_STACK_VAR_C_OUTPUT_PTR           = 29,
  LIBXSMM_GEMM_STACK_VAR_BIAS_SCRATCH_PTR       = 30,
  LIBXSMM_GEMM_STACK_VAR_ZPT_PTR                = 31,
  LIBXSMM_GEMM_STACK_VAR_AUX_VAR                = 32,
  LIBXSMM_GEMM_STACK_VAR_MXSCALE_PTR            = 33,
  LIBXSMM_GEMM_STACK_VAR_SCF_BRGEMM_PTR         = 34,
  LIBXSMM_GEMM_STACK_VAR_ZPT_BRGEMM_PTR         = 35,
  LIBXSMM_GEMM_STACK_VAR_BSCALE_PTR             = 36,
  LIBXSMM_GEMM_STACK_VAR_BSCALE_BRGEMM_PTR      = 37
} libxsmm_gemm_stack_var;

#if 0
/* Compressed meltw reduce flags.
 * Every enumerator is a distinct selector code, numbered consecutively from
 * 0 to 13; the values are not OR-able bit masks. */
typedef enum libxsmm_meltw_comp_redu_flags {
  LIBXSMM_MELTW_COMP_FLAG_REDUCE_NONE                          =  0,
  LIBXSMM_MELTW_COMP_FLAG_REDUCE_OP_ADD                        =  1,
  LIBXSMM_MELTW_COMP_FLAG_REDUCE_OP_MAX                        =  2,
  LIBXSMM_MELTW_COMP_FLAG_REDUCE_OP_MUL                        =  3,
  LIBXSMM_MELTW_COMP_FLAG_REDUCE_ROWS                          =  4,
  LIBXSMM_MELTW_COMP_FLAG_REDUCE_COLS                          =  5,
  LIBXSMM_MELTW_COMP_FLAG_REDUCE_ELTS                          =  6,
  LIBXSMM_MELTW_COMP_FLAG_REDUCE_ELTS_SQUARED                  =  7,
  LIBXSMM_MELTW_COMP_FLAG_REDUCE_OP_ADD_ROWS                   =  8,
  LIBXSMM_MELTW_COMP_FLAG_REDUCE_OP_ADD_COLS                   =  9,
  LIBXSMM_MELTW_COMP_FLAG_REDUCE_OP_ADD_ROWS_ELTS_ELTS_SQUARED = 10,
  LIBXSMM_MELTW_COMP_FLAG_REDUCE_OP_ADD_COLS_ELTS_ELTS_SQUARED = 11,
  LIBXSMM_MELTW_COMP_FLAG_REDUCE_OP_ADD_ROWS_ELTS              = 12,
  LIBXSMM_MELTW_COMP_FLAG_REDUCE_OP_ADD_COLS_ELTS              = 13
} libxsmm_meltw_comp_redu_flags;

/* Compressed meltw relu flags: three selector codes numbered 0 to 2. */
typedef enum libxsmm_meltw_comp_relu_flags {
  LIBXSMM_MELTW_COMP_FLAG_RELU_NONE = 0,
  LIBXSMM_MELTW_COMP_FLAG_RELU_FWD  = 1,
  LIBXSMM_MELTW_COMP_FLAG_RELU_BWD  = 2
} libxsmm_meltw_comp_relu_flags;

/* Compressed meltw scale flags.
 * Every enumerator is a distinct selector code, numbered consecutively from
 * 0 to 26; the values are not OR-able bit masks. */
typedef enum libxsmm_meltw_comp_scal_flags {
  /* codes 0..5: single-word names */
  LIBXSMM_MELTW_COMP_FLAG_SCALE_NONE                          =  0,
  LIBXSMM_MELTW_COMP_FLAG_SCALE_MULT                          =  1,
  LIBXSMM_MELTW_COMP_FLAG_SCALE_SHIFT                         =  2,
  LIBXSMM_MELTW_COMP_FLAG_SCALE_ADD_BIAS                      =  3,
  LIBXSMM_MELTW_COMP_FLAG_SCALE_ROWS                          =  4,
  LIBXSMM_MELTW_COMP_FLAG_SCALE_COLS                          =  5,
  /* codes 6..12: names ending in _ROWS */
  LIBXSMM_MELTW_COMP_FLAG_SCALE_MULT_ROWS                     =  6,
  LIBXSMM_MELTW_COMP_FLAG_SCALE_SHIFT_ROWS                    =  7,
  LIBXSMM_MELTW_COMP_FLAG_SCALE_ADD_BIAS_ROWS                 =  8,
  LIBXSMM_MELTW_COMP_FLAG_SCALE_MULT_SHIFT_ROWS               =  9,
  LIBXSMM_MELTW_COMP_FLAG_SCALE_ADD_BIAS_SHIFT_ROWS           = 10,
  LIBXSMM_MELTW_COMP_FLAG_SCALE_MULT_ADD_BIAS_ROWS            = 11,
  LIBXSMM_MELTW_COMP_FLAG_SCALE_MULT_SHIFT_ADD_BIAS_ROWS      = 12,
  /* codes 13..19: names ending in _COLS */
  LIBXSMM_MELTW_COMP_FLAG_SCALE_MULT_COLS                     = 13,
  LIBXSMM_MELTW_COMP_FLAG_SCALE_SHIFT_COLS                    = 14,
  LIBXSMM_MELTW_COMP_FLAG_SCALE_ADD_BIAS_COLS                 = 15,
  LIBXSMM_MELTW_COMP_FLAG_SCALE_MULT_SHIFT_COLS               = 16,
  LIBXSMM_MELTW_COMP_FLAG_SCALE_ADD_BIAS_SHIFT_COLS           = 17,
  LIBXSMM_MELTW_COMP_FLAG_SCALE_MULT_ADD_BIAS_COLS            = 18,
  LIBXSMM_MELTW_COMP_FLAG_SCALE_MULT_SHIFT_ADD_BIAS_COLS      = 19,
  /* codes 20..26: names ending in _ROWS_COLS */
  LIBXSMM_MELTW_COMP_FLAG_SCALE_MULT_ROWS_COLS                = 20,
  LIBXSMM_MELTW_COMP_FLAG_SCALE_SHIFT_ROWS_COLS               = 21,
  LIBXSMM_MELTW_COMP_FLAG_SCALE_ADD_BIAS_ROWS_COLS            = 22,
  LIBXSMM_MELTW_COMP_FLAG_SCALE_MULT_SHIFT_ROWS_COLS          = 23,
  LIBXSMM_MELTW_COMP_FLAG_SCALE_ADD_BIAS_SHIFT_ROWS_COLS      = 24,
  LIBXSMM_MELTW_COMP_FLAG_SCALE_MULT_ADD_BIAS_ROWS_COLS       = 25,
  LIBXSMM_MELTW_COMP_FLAG_SCALE_MULT_SHIFT_ADD_BIAS_ROWS_COLS = 26
} libxsmm_meltw_comp_scal_flags;

/* Compressed meltw cvta flags: four selector codes numbered 0 to 3. */
typedef enum libxsmm_meltw_comp_cvta_flags {
  LIBXSMM_MELTW_COMP_FLAG_CVTA_NONE      = 0,
  LIBXSMM_MELTW_COMP_FLAG_CVTA_FUSE_RELU = 1,
  LIBXSMM_MELTW_COMP_FLAG_CVTA_FUSE_TANH = 2,
  LIBXSMM_MELTW_COMP_FLAG_CVTA_FUSE_SIGM = 3
} libxsmm_meltw_comp_cvta_flags;

/* Compressed meltw acvt flags: three selector codes numbered 0 to 2. */
typedef enum libxsmm_meltw_comp_acvt_flags {
  LIBXSMM_MELTW_COMP_FLAG_ACVT_NONE      = 0,
  LIBXSMM_MELTW_COMP_FLAG_ACVT_FUSE_TANH = 1,
  LIBXSMM_MELTW_COMP_FLAG_ACVT_FUSE_SIGM = 2
} libxsmm_meltw_comp_acvt_flags;

/* Compressed meltw cbiasact flags.
 * Every enumerator is a distinct selector code, numbered consecutively from
 * 0 to 19; the values are not OR-able bit masks. */
typedef enum libxsmm_meltw_comp_flags {
  /* codes 0..6: names carrying a single keyword */
  LIBXSMM_MELTW_COMP_FLAG_NONE                         = 0,
  LIBXSMM_MELTW_COMP_FLAG_COLBIAS                      = 1,
  LIBXSMM_MELTW_COMP_FLAG_ACT_RELU                     = 2,
  LIBXSMM_MELTW_COMP_FLAG_ACT_TANH                     = 3,
  LIBXSMM_MELTW_COMP_FLAG_ACT_SIGM                     = 4,
  LIBXSMM_MELTW_COMP_FLAG_ACT_GELU                     = 5,
  LIBXSMM_MELTW_COMP_FLAG_OVERWRITE_C                  = 6,

  /* codes 7..10: COLBIAS_ACT_* names */
  LIBXSMM_MELTW_COMP_FLAG_COLBIAS_ACT_RELU             = 7,
  LIBXSMM_MELTW_COMP_FLAG_COLBIAS_ACT_TANH             = 8,
  LIBXSMM_MELTW_COMP_FLAG_COLBIAS_ACT_SIGM             = 9,
  LIBXSMM_MELTW_COMP_FLAG_COLBIAS_ACT_GELU             = 10,

  /* codes 11..19: names ending in _OVERWRITE_C */
  LIBXSMM_MELTW_COMP_FLAG_COLBIAS_ACT_RELU_OVERWRITE_C = 11,
  LIBXSMM_MELTW_COMP_FLAG_COLBIAS_ACT_TANH_OVERWRITE_C = 12,
  LIBXSMM_MELTW_COMP_FLAG_COLBIAS_ACT_SIGM_OVERWRITE_C = 13,
  LIBXSMM_MELTW_COMP_FLAG_COLBIAS_ACT_GELU_OVERWRITE_C = 14,
  LIBXSMM_MELTW_COMP_FLAG_COLBIAS_OVERWRITE_C          = 15,
  LIBXSMM_MELTW_COMP_FLAG_ACT_RELU_OVERWRITE_C         = 16,
  LIBXSMM_MELTW_COMP_FLAG_ACT_TANH_OVERWRITE_C         = 17,
  LIBXSMM_MELTW_COMP_FLAG_ACT_SIGM_OVERWRITE_C         = 18,
  LIBXSMM_MELTW_COMP_FLAG_ACT_GELU_OVERWRITE_C         = 19
} libxsmm_meltw_comp_flags;
#endif

/**
 * Returns an int for the field selected by `type` of the given mateltwise
 * descriptor. Judging by the name, the result is the precision/datatype enum
 * value stored for that field - TODO confirm against the implementation.
 * The descriptor is taken by pointer-to-const.
 */
LIBXSMM_API_INTERN
int libxsmm_meltw_getenum_precision( const libxsmm_meltw_descriptor* i_mateltwise_desc,
                                     libxsmm_meltw_field_type        type);

/**
 * Resets the loop-label tracker passed in. The tracker is taken through a
 * non-const pointer, so it may be modified in place; the post-reset state is
 * defined by the implementation and not visible in this header.
 */
LIBXSMM_API_INTERN
void libxsmm_reset_loop_label_tracker( libxsmm_loop_label_tracker* io_loop_label_tracker );

/**
 * Resets the jump-label tracker passed in. The tracker is taken through a
 * non-const pointer, so it may be modified in place; the post-reset state is
 * defined by the implementation and not visible in this header.
 * NOTE(review): the parameter name is spelled "lable"; left as is here.
 */
LIBXSMM_API_INTERN
void libxsmm_reset_jump_label_tracker( libxsmm_jump_label_tracker* io_jump_lable_tracker );

/**
 * Resets the const-data tracker passed in. The tracker is taken through a
 * non-const pointer, so it may be modified in place; the post-reset state is
 * defined by the implementation and not visible in this header.
 */
LIBXSMM_API_INTERN
void libxsmm_reset_const_data_tracker( libxsmm_const_data_tracker* io_const_data_tracker );

/**
 * Produces a textual name for an x86 general-purpose register.
 * i_gp_reg_number:          register number; presumably one of the
 *                           LIBXSMM_X86_GP_REG_* values defined in this header - TODO confirm.
 * o_gp_reg_name:            caller-provided character buffer receiving the name.
 * i_gp_reg_name_max_length: presumably the capacity of o_gp_reg_name; whether it
 *                           counts the terminating NUL is not visible here - verify.
 */
LIBXSMM_API_INTERN
void libxsmm_get_x86_gp_reg_name( const unsigned int i_gp_reg_number,
                                  char*              o_gp_reg_name,
                                  const int          i_gp_reg_name_max_length );

/**
 * Queries whether the given x86 general-purpose register is callee-saved.
 * NOTE(review): presumably returns non-zero for callee-saved registers and zero
 * otherwise; the return convention and the ABI consulted are not visible in this
 * header - confirm in the implementation.
 */
LIBXSMM_API_INTERN
unsigned int libxsmm_check_x86_gp_reg_callee_save( const unsigned int i_gp_reg_number );

/**
 * Produces a textual name for an x86 instruction.
 * i_instr_number:          instruction number; presumably one of the instruction
 *                          constants defined in this header - TODO confirm.
 * o_instr_name:            caller-provided character buffer receiving the name.
 * i_instr_name_max_length: presumably the capacity of o_instr_name; whether it
 *                          counts the terminating NUL is not visible here - verify.
 */
LIBXSMM_API_INTERN
void libxsmm_get_x86_instr_name( const unsigned int i_instr_number,
                                 char*              o_instr_name,
                                 const int          i_instr_name_max_length );

/**
 * Resets a general-purpose register mapping for x86. The mapping is taken
 * through a non-const pointer, so it may be modified in place; the values
 * assigned on reset are defined by the implementation (presumably an
 * "undefined register" marker - TODO confirm).
 */
LIBXSMM_API_INTERN
void libxsmm_reset_x86_gp_reg_mapping( libxsmm_gp_reg_mapping* io_gp_reg_mapping );

/**
 * Resets a general-purpose register mapping for AArch64. It takes the same
 * libxsmm_gp_reg_mapping type as the x86 variant, through a non-const pointer,
 * so the mapping may be modified in place; the values assigned on reset are
 * defined by the implementation.
 */
LIBXSMM_API_INTERN
void libxsmm_reset_aarch64_gp_reg_mapping( libxsmm_gp_reg_mapping* io_gp_reg_mapping );

/**
 * Classifies an x86 vector instruction number by precision.
 * NOTE(review): presumably returns non-zero when the instruction operates on
 * single-precision data and zero otherwise; behavior for instruction numbers
 * that are not vector instructions is not visible here - confirm.
 */
LIBXSMM_API_INTERN
unsigned int libxsmm_is_x86_vec_instr_single_precision( const unsigned int i_instr_number );

/* some string manipulation helpers needed to generate code */
/**
 * Appends code given as a character string to the generated-code object.
 * io_generated_code: destination; taken through a non-const pointer, so it may be modified.
 * i_code_to_append:  text to append (read-only).
 * i_append_length:   presumably the number of characters of i_code_to_append to
 *                    append - TODO confirm how it relates to NUL termination.
 */
LIBXSMM_API_INTERN
void libxsmm_append_code_as_string( libxsmm_generated_code* io_generated_code,
                                    const char*             i_code_to_append,
                                    const int               i_append_length );

/**
 * Closes the function currently being emitted into io_generated_code.
 * NOTE(review): presumably appends the closing text of a function in
 * string-based code generation - confirm in the implementation.
 */
LIBXSMM_API_INTERN
void libxsmm_close_function( libxsmm_generated_code* io_generated_code );

/**
 * Emits the signature of a matrix-multiplication routine into io_generated_code.
 * io_generated_code: destination; taken through a non-const pointer, so it may be modified.
 * i_routine_name:    name of the routine (read-only string).
 * i_xgemm_desc:      GEMM descriptor (read-only); how it shapes the signature is
 *                    defined by the implementation and not visible here.
 */
LIBXSMM_API_INTERN
void libxsmm_mmfunction_signature( libxsmm_generated_code*       io_generated_code,
                                  const char*                    i_routine_name,
                                  const libxsmm_gemm_descriptor* i_xgemm_desc );

/**
 * Emits the opening part ("header") of an ISA check into io_generated_code.
 * NOTE(review): presumably must be paired with
 * libxsmm_generator_isa_check_footer - confirm in the implementation.
 */
LIBXSMM_API_INTERN
void libxsmm_generator_isa_check_header( libxsmm_generated_code* io_generated_code );

/**
 * Emits the closing part ("footer") of an ISA check into io_generated_code.
 * NOTE(review): presumably closes what libxsmm_generator_isa_check_header
 * opened - confirm in the implementation.
 */
LIBXSMM_API_INTERN
void libxsmm_generator_isa_check_footer( libxsmm_generated_code* io_generated_code );

/**
 * Returns the current error-handling setting as an int.
 * NOTE(review): presumably the value last passed to libxsmm_set_handle_error
 * (non-zero meaning enabled) - confirm in the implementation.
 */
LIBXSMM_API_INTERN
int libxsmm_get_handle_error(void);

/**
 * Sets the error-handling setting from `enable`.
 * NOTE(review): presumably non-zero enables and zero disables error handling;
 * scope (global vs. per-thread) is not visible in this header - confirm.
 */
LIBXSMM_API_INTERN
void libxsmm_set_handle_error(int enable);

/**
 * Handles the error identified by i_error_code for the given generated-code
 * object. The context, source file and line number describe where the error
 * originated, and emit_message selects whether a message is emitted.
 * NOTE(review): whether the error code is also recorded in io_generated_code
 * is not visible in this header - confirm in the implementation.
 */
LIBXSMM_API_INTERN
void libxsmm_handle_error( libxsmm_generated_code* io_generated_code,
                           const unsigned int i_error_code,
                           /** Contextual information (source of error), e.g., function name. */
                           const char context[],
                           /** Filename related to source of error (like context). */
                           const char srcfile[],
                           /** Line number, i.e., not considered if less or equal to zero. */
                           int linenum,
                           /** Whether to emit (non-zero), or suppress (zero) any message. */
                           int emit_message );

/**
 * Computes a blocking of i_size and reports it through four output pointers
 * (two range/block pairs).
 * NOTE(review): presumably i_size is split into up to two ranges whose block
 * sizes do not exceed i_max_block; the exact meaning of each o_range and
 * o_block value and of the unsigned return value is not visible in this header
 * - confirm in the implementation.
 */
LIBXSMM_API_INTERN unsigned int libxsmm_compute_equalized_blocking(
  unsigned int i_size, unsigned int i_max_block,
  unsigned int* o_range_1, unsigned int* o_block_1,
  unsigned int* o_range_2, unsigned int* o_block_2 );

/**
 * Precision targets for generated kernels; the enumerators are numbered
 * 0, 1, 2 in declaration order.
 */
typedef enum libxsmm_ulp_precision {
  /* rounded correctly */
  LIBXSMM_ULP_PRECISION_HALF_ULP = 0,
  /* perfect except for last bit */
  LIBXSMM_ULP_PRECISION_ONE_ULP  = 1,
  /* can be pretty bad, but should have the correct order of magnitude */
  LIBXSMM_ULP_PRECISION_ESTIMATE = 2
} libxsmm_ulp_precision;

/**
 * Returns the precision targeted for generated kernels:
 * half an ulp (perfect), one ulp (close to perfect), or merely an estimate.
 * The target can be set with the environment variable
 * LIBXSMM_ULP_PRECISION={0.5, 1, ESTIMATE}.
 */
LIBXSMM_API_INTERN libxsmm_ulp_precision libxsmm_get_ulp_precision(void);

/**
 * Getters that read a GEMM descriptor's packed `datatype` bytes (read-only)
 * and return an int for the A, B, C, AB-common, ABC-common or compute precision.
 * NOTE(review): presumably the int is a libxsmm_datatype enum value, and the
 * byte layout matches what LIBXSMM_GEMM_SET_DESC_DATATYPE writes; the result
 * of the *_COMMON_* getters when the operand types differ is not visible in
 * this header - confirm in the implementation.
 */
LIBXSMM_API_INTERN int LIBXSMM_GEMM_GETENUM_A_PREC(const unsigned char *datatype);
LIBXSMM_API_INTERN int LIBXSMM_GEMM_GETENUM_B_PREC(const unsigned char *datatype);
LIBXSMM_API_INTERN int LIBXSMM_GEMM_GETENUM_C_PREC(const unsigned char *datatype);
LIBXSMM_API_INTERN int LIBXSMM_GEMM_GETENUM_AB_COMMON_PREC(const unsigned char *datatype);
LIBXSMM_API_INTERN int LIBXSMM_GEMM_GETENUM_ABC_COMMON_PREC(const unsigned char *datatype);
LIBXSMM_API_INTERN int LIBXSMM_GEMM_GETENUM_COMP_PREC(const unsigned char *datatype);
/**
 * Writes the A, B, C and compute datatypes into the byte buffer out_datatype.
 * NOTE(review): the encoding and the number of bytes written are not visible
 * in this header; presumably out_datatype is the `datatype` field of a GEMM
 * descriptor - confirm the required buffer size in the implementation.
 */
LIBXSMM_API_INTERN void LIBXSMM_GEMM_SET_DESC_DATATYPE(libxsmm_datatype a_dt, libxsmm_datatype b_dt, libxsmm_datatype c_dt, libxsmm_datatype comp_dt, unsigned char *out_datatype);

#endif /* GENERATOR_COMMON_H */
